diff --git "a/training_artifacts/logs/pipeline_cleaned.txt" "b/training_artifacts/logs/pipeline_cleaned.txt" --- "a/training_artifacts/logs/pipeline_cleaned.txt" +++ "b/training_artifacts/logs/pipeline_cleaned.txt" @@ -3,7 +3,7 @@ Job Name: lf_torch_test__interactive Hostname: gl064.hpc.nyu.edu Number of nodes: 2 GPUs per node: 2 -Start Time: Wed Oct 22 04:01:29 PM EDT 2025 +Start Time: Wed Oct 22 04:24:25 PM EDT 2025 Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log ======================================== Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env @@ -34,7 +34,7 @@ Master node (this node) will now join training as rank 0 ======================================== STAGE 1: Training Model -Start Time: Wed Oct 22 04:01:31 PM EDT 2025 +Start Time: Wed Oct 22 04:24:28 PM EDT 2025 ======================================== Multi-node training detected Nodes: 2, GPUs per node: 2 @@ -63,19 +63,19 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default import pkg_resources /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources -[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled. 
-[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 -[INFO|2025-10-22 16:01:48] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:48,643 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:48,814 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
-[INFO|configuration_utils.py:765] 2025-10-22 16:01:49,018 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:01:49,020 >> Model config Qwen2Config { +[INFO|2025-10-22 16:24:45] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 +[INFO|2025-10-22 16:24:45] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled. +[INFO|2025-10-22 16:24:45] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file tokenizer_config.json from cache at 
/scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,173 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:24:45,342 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:765] 2025-10-22 16:24:45,580 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json +[INFO|configuration_utils.py:839] 2025-10-22 16:24:45,581 >> Model config Qwen2Config { "architectures": [ "Qwen2ForCausalLM" ], @@ -131,82 +131,132 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default "vocab_size": 151936 } -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file 
tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:01:49,085 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:01:49,252 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. -[INFO|2025-10-22 16:01:49] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... -gl065:3752807:3752807 [1] NCCL INFO cudaDriverVersion 13000 -gl065:3752807:3752807 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3752807:3752807 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0> -gl065:3752807:3752807 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl065:3752807:3752807 [1] NCCL INFO Comm config Blocking set to 1 -gl065:3752807:3752930 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl065:3752807:3752930 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. -gl065:3752807:3752930 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3752807:3752930 [1] NCCL INFO NCCL_IB_HCA set to mlx5 -gl065:3752807:3752930 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> -gl065:3752807:3752930 [1] NCCL INFO Initialized NET plugin IB -gl065:3752807:3752930 [1] NCCL INFO Assigned NET plugin IB to comm -gl065:3752807:3752930 [1] NCCL INFO Using network IB -gl065:3752807:3752930 [1] NCCL INFO ncclCommInitRankConfig comm 0x14fa4e70 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init START -[rank2]:[W1022 16:01:49.405201816 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. 
You can specify device_id in init_process_group() -gl065:3752806:3752806 [0] NCCL INFO cudaDriverVersion 13000 -gl065:3752806:3752806 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3752806:3752806 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0> -gl065:3752806:3752806 [0] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl065:3752806:3752806 [0] NCCL INFO Comm config Blocking set to 1 -gl065:3752806:3752942 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl065:3752806:3752942 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. -gl065:3752806:3752942 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3752806:3752942 [0] NCCL INFO NCCL_IB_HCA set to mlx5 -gl065:3752806:3752942 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> -gl065:3752806:3752942 [0] NCCL INFO Initialized NET plugin IB -gl065:3752806:3752942 [0] NCCL INFO Assigned NET plugin IB to comm -gl065:3752806:3752942 [0] NCCL INFO Using network IB -gl065:3752806:3752942 [0] NCCL INFO ncclCommInitRankConfig comm 0x12f77100 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init START -gl065:3752807:3752930 [1] NCCL INFO RAS client listening socket at ::1<28028> -gl065:3752806:3752942 [0] NCCL INFO RAS client listening socket at ::1<28028> -gl065:3752807:3752930 [1] NCCL INFO Bootstrap timings total 0.320778 (create 0.000028, send 0.000429, recv 0.000956, ring 0.002613, delay 0.000000) -gl065:3752806:3752942 [0] NCCL INFO Bootstrap timings total 0.012332 (create 0.000024, send 0.000508, recv 0.001014, ring 0.001299, delay 0.000000) -gl065:3752807:3752930 [1] NCCL INFO Setting affinity for GPU 1 to 0-15 -gl065:3752806:3752942 [0] NCCL INFO Setting affinity for GPU 0 to 0-15 -gl065:3752806:3752942 [0] NCCL INFO comm 0x12f77100 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0 -gl065:3752807:3752930 [1] NCCL INFO comm 0x14fa4e70 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0 -gl065:3752806:3752942 [0] NCCL INFO 
Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1 -gl065:3752806:3752942 [0] NCCL INFO P2P Chunksize set to 131072 -gl065:3752807:3752930 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 -gl065:3752807:3752930 [1] NCCL INFO P2P Chunksize set to 131072 -gl065:3752806:3752942 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl065:3752807:3752930 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl065:3752807:3752949 [1] NCCL INFO [Proxy Service] Device 1 CPU core 3 -gl065:3752807:3752951 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 4 -gl065:3752806:3752950 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 13 -gl065:3752806:3752948 [0] NCCL INFO [Proxy Service] Device 0 CPU core 12 -gl065:3752807:3752930 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl065:3752807:3752930 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl065:3752806:3752942 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl065:3752806:3752942 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl065:3752807:3752930 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. -gl065:3752807:3752930 [1] NCCL INFO ncclCommInitRankConfig comm 0x14fa4e70 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x503fff15b9ea778b - Init COMPLETE -gl065:3752806:3752942 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
-gl065:3752806:3752942 [0] NCCL INFO ncclCommInitRankConfig comm 0x12f77100 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x503fff15b9ea778b - Init COMPLETE -gl065:3752807:3752930 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.45 (kernels 0.08, alloc 0.01, bootstrap 0.32, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00) -gl065:3752806:3752942 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.13 (kernels 0.08, alloc 0.01, bootstrap 0.01, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.00, rest 0.00) -gl065:3752806:3752952 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0 -gl065:3752806:3752952 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0 -gl065:3752806:3752954 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 15 -gl065:3752806:3752952 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct -gl065:3752806:3752952 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct -gl065:3752807:3752953 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0 -gl065:3752807:3752953 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0 -gl065:3752807:3752955 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 9 -gl065:3752807:3752953 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 -gl065:3752806:3752952 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file tokenizer.json from cache 
at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:24:45,647 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:24:45,812 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|2025-10-22 16:24:45] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... +Converting format of dataset: 0%| | 0/54000 [00:00 +gl065:3767293:3767293 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 +gl065:3767292:3767292 [0] NCCL INFO cudaDriverVersion 13000 +gl065:3767292:3767292 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl065:3767292:3767292 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0> +gl065:3767292:3767292 [0] NCCL INFO NCCL version 2.27.5+cuda12.9 +gl065:3767293:3767293 [1] NCCL INFO Comm config Blocking set to 1 +gl065:3767292:3767292 [0] NCCL INFO Comm config Blocking set to 1 +gl065:3767292:3767436 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. +gl065:3767292:3767436 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. +gl065:3767293:3767435 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. +gl065:3767293:3767435 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. 
+gl065:3767293:3767435 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl065:3767292:3767436 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl065:3767293:3767435 [1] NCCL INFO NCCL_IB_HCA set to mlx5 +gl065:3767292:3767436 [0] NCCL INFO NCCL_IB_HCA set to mlx5 +gl065:3767292:3767436 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> +gl065:3767293:3767435 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> +gl065:3767292:3767436 [0] NCCL INFO Initialized NET plugin IB +gl065:3767293:3767435 [1] NCCL INFO Initialized NET plugin IB +gl065:3767292:3767436 [0] NCCL INFO Assigned NET plugin IB to comm +gl065:3767293:3767435 [1] NCCL INFO Assigned NET plugin IB to comm +gl065:3767292:3767436 [0] NCCL INFO Using network IB +gl065:3767293:3767435 [1] NCCL INFO Using network IB +gl065:3767293:3767435 [1] NCCL INFO ncclCommInitRankConfig comm 0x13f074c0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x7fab6aa5c74dddef - Init START +gl065:3767292:3767436 [0] NCCL INFO ncclCommInitRankConfig comm 0x14d6bca0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x7fab6aa5c74dddef - Init START +gl065:3767292:3767436 [0] NCCL INFO RAS client listening socket at ::1<28028> +gl065:3767293:3767435 [1] NCCL INFO RAS client listening socket at ::1<28028> +gl065:3767293:3767435 [1] NCCL INFO Bootstrap timings total 0.004044 (create 0.000031, send 0.000643, recv 0.001672, ring 0.000784, delay 0.000000) +gl065:3767292:3767436 [0] NCCL INFO Bootstrap timings total 0.018859 (create 0.000024, send 0.000642, recv 0.000873, ring 0.002112, delay 0.000000) +gl065:3767292:3767436 [0] NCCL INFO Setting affinity for GPU 0 to 0-15 +gl065:3767293:3767435 [1] NCCL INFO Setting affinity for GPU 1 to 0-15 +gl065:3767292:3767436 [0] NCCL INFO comm 0x14d6bca0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0 +gl065:3767293:3767435 [1] NCCL INFO comm 0x13f074c0 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0 
+gl065:3767292:3767436 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1 +gl065:3767292:3767436 [0] NCCL INFO P2P Chunksize set to 131072 +gl065:3767293:3767435 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gl065:3767293:3767435 [1] NCCL INFO P2P Chunksize set to 131072 +gl065:3767293:3767435 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +gl065:3767292:3767436 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +gl065:3767293:3767441 [1] NCCL INFO [Proxy Service] Device 1 CPU core 13 +gl065:3767292:3767442 [0] NCCL INFO [Proxy Service] Device 0 CPU core 14 +gl065:3767293:3767443 [1] NCCL INFO [Proxy Service UDS] Device 1 Clibnccl-profiler.so. +gl064:2379162:2379274 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +gl064:2379163:2379280 [1] NCCL INFO [Proxy Service] Device 1 CPU core 8 +gl064:2379163:2379282 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 10 +gl064:2379162:2379281 [0] NCCL INFO [Proxy Service] Device 0 CPU core 9 +gl064:2379162:2379283 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 5 +gl064:2379163:2379275 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +gl064:2379163:2379275 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer +gl064:2379162:2379274 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +gl064:2379162:2379274 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer +gl064:2379162:2379274 [0] NCCL INFO CC Off, workFifoBytes 1048576 +gl064:2379163:2379275 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+gl064:2379163:2379275 [1] NCCL INFO ncclCommInitRankConfig comm 0x1332eab0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x7fab6aa5c74dddef - Init COMPLETE +gl064:2379163:2379275 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.14 (kernels 0.08, alloc 0.01, bootstrap 0.01, allgathers 0.03, topo 0.01, graphs 0.00, connections 0.00, rest 0.00) +gl064:2379162:2379274 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +gl064:2379162:2379274 [0] NCCL INFO ncclCommInitRankConfig comm 0x16c656e0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x7fab6aa5c74dddef - Init COMPLETE +gl064:2379162:2379274 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.15 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.00, rest 0.00) +gl064:2379162:2379284 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0 +gl064:2379162:2379286 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 4 +gl064:2379162:2379284 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0 +gl064:2379162:2379284 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct +gl064:2379162:2379284 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct +gl064:2379163:2379285 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0 +gl064:2379163:2379285 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0 +gl064:2379163:2379287 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 12 +gl064:2379162:2379284 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 +gl064:2379163:2379285 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 +Running tokenizer on dataset: 0%| | 0/54000 [00:00<?, ? examples/s] +Traceback (most recent call last): + File "<frozen runpy>", line 198, in _run_module_as_main + File "<frozen runpy>", line 88, in _run_code + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in <module> + main() + File
"/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main + run(args) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run + elastic_launch( + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 293, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +/scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/src/train.py FAILED +------------------------------------------------------------ +Failures: + <NO_OTHER_FAILURES> +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-10-22_16:24:53 + host : gl065.hpc.nyu.edu + rank : 2 (local_rank: 0) + exitcode : -7 (pid: 3767292) + error_file: <N/A> + traceback : Signal 7 (SIGBUS) received by PID 3767292 +============================================================ + +======================================== +ERROR: Training failed with exit code 1 +======================================== + +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl065.hpc.nyu.edu +Process cleanup complete +Running tokenizer on dataset: 4%| | 2000/54000 [00:05<02:16, 379.92
examples/s]Running tokenizer on
dataset: 96%|| 52000/54000 [02:18<00:05, 374.82 examples/s]Running tokenizer on dataset: 98%|| 53000/54000 [02:21<00:02, 372.06 examples/s]Running tokenizer on dataset: 100%|| 54000/54000 [02:24<00:00, 370.88 examples/s]Running tokenizer on dataset: 100%|| 54000/54000 [02:24<00:00, 373.84 examples/s] training example: input_ids: [33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 
458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 
522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 
1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198] @@ -458,195 +508,163 @@ Hence, the correct answer is: (67 + 31) + 71 <|endoftext|> -[INFO|configuration_utils.py:765] 2025-10-22 16:01:50,482 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:01:50,483 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", 
- "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} -[INFO|2025-10-22 16:01:50] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training. -[WARNING|logging.py:328] 2025-10-22 16:01:50,810 >> `torch_dtype` is deprecated! Use `dtype` instead! -[INFO|modeling_utils.py:1172] 2025-10-22 16:01:50,811 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors -[INFO|modeling_utils.py:2341] 2025-10-22 16:01:50,812 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16. 
-[INFO|configuration_utils.py:986] 2025-10-22 16:01:50,813 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643, - "use_cache": false -} +[2025-10-22 16:28:14] gl064:2379162:2379286 [0] transport/net_ib.cc:2453 NCCL WARN NET/IB: Got completion from peer 10.0.5.1<49178> with status=12 opcode=129 len=16 vendor err 129 (Recv) hca mlx5_0 +gl064:2379162:2379286 [0] NCCL INFO transport/net.cc:1393 -> 6 +E1022 16:35:12.649000 2379105 site-packages/torch/distributed/elastic/multiprocessing/api.py:882] failed (exitcode: -15) local_rank: 0 (pid: 2379162) of binary: /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/python +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in + main() + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main + run(args) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run + elastic_launch( + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 293, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================ +/scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/src/train.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2025-10-22_16:35:12 + host : gl064.hpc.nyu.edu + rank : 1 (local_rank: 1) + exitcode : -15 (pid: 2379163) + error_file: + traceback : Signal 15 (SIGTERM) received by PID 2379163 +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-10-22_16:35:12 + host : gl064.hpc.nyu.edu + rank : 0 (local_rank: 0) + exitcode : -15 (pid: 2379162) + error_file: + traceback : Signal 15 (SIGTERM) received by PID 2379162 +============================================================ + +======================================== +ERROR: Training failed with exit code 1 +======================================== -`torch_dtype` is deprecated! Use `dtype` instead! -[INFO|configuration_utils.py:941] 2025-10-22 16:01:51,064 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json -[INFO|configuration_utils.py:986] 2025-10-22 16:01:51,064 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643, - "max_new_tokens": 2048 -} +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl064.hpc.nyu.edu +Cleaning up processes on worker node: gl065 +Process cleanup complete +======================================== +Job Name: lf_torch_test__interactive +Hostname: gl064.hpc.nyu.edu +Number of nodes: 2 +GPUs per node: 2 +Start Time: Wed Oct 22 04:35:42 PM EDT 2025 +Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log +======================================== +Sourcing secrets from: 
/scratch/zrs2020/LlamaFactoryHelper/secrets.env -[INFO|dynamic_module_utils.py:423] 2025-10-22 16:01:51,095 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B. -[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled. -[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference. -[INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Upcasting trainable params to float32. -[INFO|2025-10-22 16:01:51] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA -[INFO|2025-10-22 16:01:51] llamafactory.model.model_utils.misc:143 >> Found linear modules: gate_proj,k_proj,down_proj,o_proj,up_proj,v_proj,q_proj -[INFO|2025-10-22 16:01:51] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826 -[WARNING|trainer.py:906] 2025-10-22 16:01:51,337 >> The model is already on multiple devices. Skipping the move to device specified in `args`. -[INFO|trainer.py:699] 2025-10-22 16:01:51,339 >> max_steps is given, it will override any value given in num_train_epochs -[INFO|trainer.py:749] 2025-10-22 16:01:51,339 >> Using auto half precision backend -[WARNING|trainer.py:982] 2025-10-22 16:01:51,340 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. -The model is already on multiple devices. Skipping the move to device specified in `args`. -The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. 
-[INFO|trainer.py:2519] 2025-10-22 16:01:51,824 >> ***** Running training ***** -[INFO|trainer.py:2520] 2025-10-22 16:01:51,824 >> Num examples = 48,600 -[INFO|trainer.py:2521] 2025-10-22 16:01:51,824 >> Num Epochs = 1 -[INFO|trainer.py:2522] 2025-10-22 16:01:51,824 >> Instantaneous batch size per device = 1 -[INFO|trainer.py:2525] 2025-10-22 16:01:51,824 >> Total train batch size (w. parallel, distributed & accumulation) = 4 -[INFO|trainer.py:2526] 2025-10-22 16:01:51,824 >> Gradient Accumulation steps = 1 -[INFO|trainer.py:2527] 2025-10-22 16:01:51,824 >> Total optimization steps = 150 -[INFO|trainer.py:2528] 2025-10-22 16:01:51,826 >> Number of trainable parameters = 4,399,104 -tion) = 4 -[INFO|trainer.py:2526] 2025-10-22 16:01:51,823 >> Gradient Accumulation steps = 1 -[INFO|trainer.py:2527] 2025-10-22 16:01:51,823 >> Total optimization steps = 150 -[INFO|trainer.py:2528] 2025-10-22 16:01:51,825 >> Number of trainable parameters = 4,399,104 -[INFO|integration_utils.py:867] 2025-10-22 16:01:51,847 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" -wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin -wandb: Tracking run with wandb version 0.22.2 -wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_160152-f7vqjhyf -wandb: Run `wandb offline` to turn off syncing. 
-wandb: Syncing run interactive_test -wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory -wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/f7vqjhyf - 0%| | 0/150 [00:00> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50 -[INFO|configuration_utils.py:765] 2025-10-22 16:02:05,262 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:02:05,263 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} +======================================== +Configuration Paths +======================================== +Train Config: 
/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml +Dataset Info: +Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints +Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive -[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:05,402 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja -[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:05,406 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json -[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:05,410 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json - 34%| | 51/150 [00:13<00:44, 2.20it/s][INFO|trainer.py:2810] 2025-10-22 16:02:29,387 >> -Training completed. Do not forget to share your model on huggingface.co/models =) +======================================== +Multi-Node Coordination +======================================== +This is the master node - coordinating worker nodes... 
+Master node: gl064 +Master port: 29500 +World size: 2 + +Launching on worker node 1: gl065 +All worker nodes launched successfully +Master node (this node) will now join training as rank 0 + + +======================================== +STAGE 1: Training Model +Start Time: Wed Oct 22 04:35:45 PM EDT 2025 +======================================== +Multi-node training detected +Nodes: 2, GPUs per node: 2 +Master address: gl064 +Master port: 29500 +Node rank: 0 +World size: 2 +CUDA_VISIBLE_DEVICES: 0,1 +LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory +Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml + +Starting distributed training with torch.distributed.run... + +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +================================== +Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env + +======================================== +Configuration Paths +======================================== +Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml +Dataset Info: +Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints +Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive + + +======================================== +STAGE 1: Training Model +Start Time: Wed Oct 22 04:35:49 PM EDT 2025 +======================================== +Multi-node training detected +Nodes: 2, GPUs per node: 2 +Master address: gl064 +Master port: 29500 +Node rank: 1 +World size: 2 +CUDA_VISIBLE_DEVICES: 0,1 +LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory +Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Starting distributed training with torch.distributed.run... 
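[Editor's note, not part of the log] The banner above reports "World size: 2" while the llamafactory log lines elsewhere report "world size: 4". The pipeline script counts nodes, whereas torch.distributed.run counts one worker process per GPU. A minimal sketch of the relationship, using only the values printed in this log:

```python
# torchrun's world size counts processes (one per GPU), not nodes.
# Both numbers are taken from the banners in this log.
nnodes = 2          # "Nodes: 2"
nproc_per_node = 2  # "GPUs per node: 2"

world_size = nnodes * nproc_per_node
print(world_size)  # 4, matching "Process rank: 0, world size: 4"
```

So global ranks 0-1 live on gl064 and ranks 2-3 on gl065, consistent with the per-node "Process rank" lines later in the log.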
-gl065:3752807:3752807 [1] NCCL INFO comm 0x14fa4e70 rank 3 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
-gl065:3752806:3752806 [0] NCCL INFO comm 0x12f77100 rank 2 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
- 40%| | 60/150 [00:15<00:16, 5.57it/s] {'loss': 0.6139, 'grad_norm': 0.4990316331386566, 'learning_rate': 3.0333333333333337e-05, 'epoch': 0.0}
- 47%| | 70/150 [00:17<00:19, 4.10it/s] {'loss': 0.597, 'grad_norm': 0.5236718058586121, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.01}
- 53%| | 80/150 [00:19<00:16, 4.27it/s] {'loss': 0.6205, 'grad_norm': 0.41710713505744934, 'learning_rate': 2.3666666666666668e-05, 'epoch': 0.01}
- 60%| | 90/150 [00:21<00:11, 5.22it/s] {'loss': 0.6038, 'grad_norm': 0.5673879981040955, 'learning_rate': 2.0333333333333334e-05, 'epoch': 0.01}
- 67%| | 100/150 [00:23<00:11, 4.31it/s] {'loss': 0.5934, 'grad_norm': 0.49819639325141907, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}
-[INFO|trainer.py:4309] 2025-10-22 16:02:16,719 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
-[INFO|configuration_utils.py:765] 2025-10-22 16:02:16,928 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:02:16,929 >> Model config Qwen2Config {
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+ warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+ warnings.warn(
+llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
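[Editor's note, not part of the log] The learning rates logged alongside the losses above decay linearly to ~3.3e-07 at step 150. A hypothetical reconstruction, assuming a base LR of 5e-5 with no warmup over the 150 optimization steps (the training config itself is not shown in this log):

```python
# Assumed schedule: linear decay from BASE_LR to 0, no warmup.
BASE_LR = 5e-5       # assumption; the base LR is never printed in this log
TOTAL_STEPS = 150    # "Total optimization steps = 150"

def logged_lr(step: int) -> float:
    """LR reported with the loss at global step `step` (1-indexed)."""
    return BASE_LR * (TOTAL_STEPS - (step - 1)) / TOTAL_STEPS

for step in (60, 70, 80, 90, 100, 150):
    print(step, logged_lr(step))
```

Under this assumption every logged value matches, e.g. step 100 gives 5e-5 * 51/150 = 1.7e-05, which is what the log reports.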
+[INFO|2025-10-22 16:36:02] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 +[INFO|2025-10-22 16:36:02] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,054 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:36:03,226 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
+[INFO|configuration_utils.py:765] 2025-10-22 16:36:03,423 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:36:03,426 >> Model config Qwen2Config {
   "architectures": [
     "Qwen2ForCausalLM"
   ],
@@ -702,17 +720,32 @@ gl065:3752806:3752806 [0] NCCL INFO comm 0x12f77100 rank 2 nranks 4 cudaDev 0 bu
   "vocab_size": 151936
 }
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:17,110 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:17,130 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:17,134 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
- 73%| | 110/150 [00:27<00:10, 3.89it/s] {'loss': 0.5548, 'grad_norm': 0.48188939690589905, 'learning_rate': 1.3666666666666666e-05, 'epoch': 0.01}
- 80%| | 120/150 [00:29<00:06, 4.79it/s] {'loss': 0.5132, 'grad_norm': 0.5217602252960205, 'learning_rate': 1.0333333333333333e-05, 'epoch': 0.01}
- 87%| | 130/150 [00:32<00:04, 4.54it/s] {'loss': 0.5586, 'grad_norm': 0.8095545172691345, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}
- 93%|| 140/150 [00:34<00:02, 4.35it/s] {'loss': 0.563, 'grad_norm': 0.4983977973461151, 'learning_rate': 3.666666666666667e-06, 'epoch': 0.01}
-100%|| 150/150 [00:36<00:00, 4.81it/s] {'loss': 0.5749, 'grad_norm': 0.4249863624572754, 'learning_rate': 3.3333333333333335e-07, 'epoch': 0.01}
-[INFO|trainer.py:4309] 2025-10-22 16:02:29,334 >> Saving model checkpoint to
/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150 -[INFO|configuration_utils.py:765] 2025-10-22 16:02:29,507 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:02:29,508 >> Model config Qwen2Config { +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:03,489 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:36:03,657 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
+[INFO|2025-10-22 16:36:03] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
+Converting format of dataset: 0%| | 0/54000 [00:00<?, ? examples/s]
+llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
+[INFO|2025-10-22 16:36:05] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
+[INFO|2025-10-22 16:36:05] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:05,686 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:36:05,861 >>
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|configuration_utils.py:765] 2025-10-22 16:36:06,075 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json +[INFO|configuration_utils.py:839] 2025-10-22 16:36:06,076 >> Model config Qwen2Config { "architectures": [ "Qwen2ForCausalLM" ], @@ -768,2773 +801,648 @@ gl065:3752806:3752806 [0] NCCL INFO comm 0x12f77100 rank 2 nranks 4 cudaDev 0 bu "vocab_size": 151936 } -[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:29,679 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/chat_template.jinja -[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:29,683 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/tokenizer_config.json -[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:29,703 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150/special_tokens_map.json -[INFO|trainer.py:2810] 2025-10-22 16:02:30,219 >> +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file tokenizer.json from cache at 
/scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:36:06,315 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:36:06,482 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|2025-10-22 16:36:06] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... +Converting format of dataset: 0%| | 0/54000 [00:00 +gl065:3773552:3773552 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 +gl065:3773552:3773552 [1] NCCL INFO Comm config Blocking set to 1 +Converting format of dataset: 9%| | 5000/54000 [00:00<00:01, 24921.93 examples/s]gl065:3773552:3773722 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. +gl065:3773552:3773722 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. 
+gl065:3773552:3773722 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3773552:3773722 [1] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3773552:3773722 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3773552:3773722 [1] NCCL INFO Initialized NET plugin IB
+gl065:3773552:3773722 [1] NCCL INFO Assigned NET plugin IB to comm
+gl065:3773552:3773722 [1] NCCL INFO Using network IB
+gl065:3773552:3773722 [1] NCCL INFO ncclCommInitRankConfig comm 0x171af1c0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x484db9b9964763 - Init START
+Converting format of dataset: 100%|| 54000/54000 [00:02<00:00, 18658.89 examples/s]
+[rank2]:[W1022 16:36:09.540270865 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
+gl065:3773551:3773551 [0] NCCL INFO cudaDriverVersion 13000
+gl065:3773551:3773551 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3773551:3773551 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
+gl065:3773551:3773551 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl065:3773551:3773551 [0] NCCL INFO Comm config Blocking set to 1
+gl065:3773551:3773758 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl065:3773551:3773758 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl065:3773551:3773758 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3773551:3773758 [0] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3773551:3773758 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3773551:3773758 [0] NCCL INFO Initialized NET plugin IB
+gl065:3773551:3773758 [0] NCCL INFO Assigned NET plugin IB to comm
+gl065:3773551:3773758 [0] NCCL INFO Using network IB
+gl065:3773551:3773758 [0] NCCL INFO ncclCommInitRankConfig comm 0x16784a40 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x484db9b9964763 - Init START
+gl065:3773552:3773722 [1] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3773551:3773758 [0] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3773552:3773722 [1] NCCL INFO Bootstrap timings total 2.776734 (create 0.000025, send 0.000561, recv 0.001064, ring 0.000833, delay 0.000000)
+gl065:3773551:3773758 [0] NCCL INFO Bootstrap timings total 0.011626 (create 0.000028, send 0.000530, recv 0.001099, ring 0.000519, delay 0.000000)
+gl065:3773551:3773758 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
+gl065:3773552:3773722 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
+gl065:3773551:3773758 [0] NCCL INFO comm 0x16784a40 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
+gl065:3773552:3773722 [1] NCCL INFO comm 0x171af1c0 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
+gl065:3773551:3773758 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
+gl065:3773552:3773722 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
+gl065:3773551:3773758 [0] NCCL INFO P2P Chunksize set to 131072
+gl065:3773552:3773722 [1] NCCL INFO P2P Chunksize set to 131072
+gl065:3773551:3773758 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3773552:3773722 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3773551:3773762 [0] NCCL INFO [Proxy Service] Device 0 CPU core 5
+gl065:3773551:3773763 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 8
+gl065:3773552:3773765 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 12
+gl065:3773552:3773764 [1] NCCL INFO [Proxy Service] Device 1 CPU core 11
+gl065:3773552:3773722 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3773552:3773722 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3773551:3773758 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3773551:3773758 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3773551:3773758 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3773551:3773758 [0] NCCL INFO ncclCommInitRankConfig comm 0x16784a40 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x484db9b9964763 - Init COMPLETE
+gl065:3773551:3773758 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.13 (kernels 0.08, alloc 0.01, bootstrap 0.01, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3773552:3773722 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3773552:3773722 [1] NCCL INFO ncclCommInitRankConfig comm 0x171af1c0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x484db9b9964763 - Init COMPLETE
+gl065:3773552:3773722 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 2.90 (kernels 0.09, alloc 0.01, bootstrap 2.78, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3773551:3773766 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3773551:3773766 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3773551:3773768 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 15
+gl065:3773551:3773766 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3773551:3773766 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3773552:3773767 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3773552:3773767 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3773552:3773769 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 2
+gl065:3773552:3773767 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+gl065:3773551:3773766 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+Running tokenizer on dataset: 0%| | 0/54000 [00:00<?, ? examples/s]
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f0e5fab0b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7f0ea1eb8531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffe92d (0x7f0ea1eb992d in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff4da (0x7f0ea1eba4da in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x31e (0x7f0ea1eb51fe in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7f0e60a336b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7f0eb7930bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7f0ebf68a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7f0ebf70f240 in /lib64/libc.so.6)
+
+[rank3]:[W1022 16:36:13.063803612 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
+[rank2]:[W1022 16:36:13.061370737 TCPStore.cpp:125] [c10d] recvValue failed on SocketImpl(fd=29, addr=[gl065.hpc.nyu.edu]:47364, remote=[gl064.hpc.nyu.edu]:29500): Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
+Exception raised from recvBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:697 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fbeb54b3b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7fbef78bb531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffe92d (0x7fbef78bc92d in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff4da (0x7fbef78bd4da in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x31e (0x7fbef78b81fe in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7fbeb64366b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7fbf0d333bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7fbf1508a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7fbf1510f240 in /lib64/libc.so.6)
+
+[rank2]:[W1022 16:36:13.064301211 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 2] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
+[rank3]:[W1022 16:36:14.063944764 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=29, addr=[gl065.hpc.nyu.edu]:47362, remote=[gl064.hpc.nyu.edu]:29500): Broken pipe
+Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:668 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f0e5fab0b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7f0ea1eb8531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffddc2 (0x7f0ea1eb8dc2 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff8ce (0x7f0ea1eba8ce in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x30e (0x7f0ea1eb51ee in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7f0e60a336b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7f0eb7930bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7f0ebf68a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7f0ebf70f240 in /lib64/libc.so.6)
+
+[rank3]:[W1022 16:36:14.066807161 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe
+[rank2]:[W1022 16:36:14.064432858 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=29, addr=[gl065.hpc.nyu.edu]:47364, remote=[gl064.hpc.nyu.edu]:29500): Broken pipe
+Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:668 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fbeb54b3b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7fbef78bb531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffddc2 (0x7fbef78bbdc2 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff8ce (0x7fbef78bd8ce in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x30e (0x7fbef78b81ee in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7fbeb64366b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7fbf0d333bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7fbf1508a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7fbf1510f240 in /lib64/libc.so.6)
+
+[rank2]:[W1022 16:36:14.067286203 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 2] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe
+Running tokenizer on dataset: 4%| | 2000/54000 [00:05<02:15, 384.62 examples/s]
+[rank3]:[W1022 16:36:15.067049419 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=29, addr=[gl065.hpc.nyu.edu]:47362, remote=[gl064.hpc.nyu.edu]:29500): Broken pipe
+Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:668 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7f0e5fab0b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7f0ea1eb8531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffddc2 (0x7f0ea1eb8dc2 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff8ce (0x7f0ea1eba8ce in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x30e (0x7f0ea1eb51ee in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7f0e60a336b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7f0eb7930bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7f0ebf68a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7f0ebf70f240 in /lib64/libc.so.6)
+
+[rank3]:[W1022 16:36:15.069966799 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe
+[rank2]:[W1022 16:36:15.067426609 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=29, addr=[gl065.hpc.nyu.edu]:47364, remote=[gl064.hpc.nyu.edu]:29500): Broken pipe
+Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:668 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fbeb54b3b80 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0x5ffd531 (0x7fbef78bb531 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #2: <unknown function> + 0x5ffddc2 (0x7fbef78bbdc2 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #3: <unknown function> + 0x5fff8ce (0x7fbef78bd8ce in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #4: c10d::TCPStore::check(std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) + 0x30e (0x7fbef78b81ee in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cpu.so)
+frame #5: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x3c8 (0x7fbeb64366b8 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/lib/libtorch_cuda.so)
+frame #6: <unknown function> + 0xdbbf4 (0x7fbf0d333bf4 in /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/../lib/libstdc++.so.6)
+frame #7: <unknown function> + 0x8a19a (0x7fbf1508a19a in /lib64/libc.so.6)
+frame #8: <unknown function> + 0x10f240 (0x7fbf1510f240 in /lib64/libc.so.6)
+
+[rank2]:[W1022 16:36:15.070285501 ProcessGroupNCCL.cpp:1771] [PG ID 0 PG GUID 0(default_pg) Rank 2] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe
+W1022 16:36:15.592000 3773515 site-packages/torch/distributed/elastic/multiprocessing/api.py:908] Sending process 3773551 closing signal SIGTERM
+E1022 16:36:15.907000 3773515 site-packages/torch/distributed/elastic/multiprocessing/api.py:882] failed (exitcode: -15) local_rank: 1 (pid: 3773552) of binary: /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/bin/python
+Traceback (most recent call last):
+  File "<frozen runpy>", line 198, in _run_module_as_main
+  File "<frozen runpy>", line 88, in _run_code
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in <module>
+    main()
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main
+    run(args)
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run
+    elastic_launch(
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 293, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+/scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory/src/train.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time : 2025-10-22_16:36:15
+  host : gl065.hpc.nyu.edu
+  rank : 3 (local_rank: 1)
+  exitcode : -15 (pid: 3773552)
+  error_file: <N/A>
+  traceback : Signal 15 (SIGTERM) received by PID 3773552
+============================================================
+
+========================================
+ERROR: Training failed with exit code 1
+========================================
-Training completed. Do not forget to share your model on huggingface.co/models =)
+========================================
+Cleaning up LlamaFactory processes
+========================================
+Cleaned up processes on gl065.hpc.nyu.edu
+Process cleanup complete
+========================================
+Job Name: lf_torch_test__interactive
+Hostname: gl064.hpc.nyu.edu
+Number of nodes: 2
+GPUs per node: 2
+Start Time: Wed Oct 22 04:43:05 PM EDT 2025
+Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
+========================================
+Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
+========================================
+Configuration Paths
+========================================
+Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-{'train_runtime': 38.3946, 'train_samples_per_second': 15.627, 'train_steps_per_second': 3.907, 'train_loss': 0.6288003253936768, 'epoch': 0.01}
-100%|| 150/150 [00:37<00:00, 4.01it/s]
-[INFO|trainer.py:4309] 2025-10-22 16:02:30,229 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-[INFO|configuration_utils.py:765] 2025-10-22 16:02:30,323 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:02:30,323 >> Model config Qwen2Config {
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "dtype": "bfloat16",
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 896,
-  "initializer_range": 0.02,
-  "intermediate_size": 4864,
-  "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "max_window_layers": 24,
-  "model_type": "qwen2",
-  "num_attention_heads": 14,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "transformers_version": "4.57.1",
-  "use_cache": true,
-  "use_mrope": false,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
-
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:30,422 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:30,426 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:30,430 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
-***** train metrics *****
-  epoch = 0.0123
-  total_flos = 2243462GF
-  train_loss = 0.6288
-  train_runtime = 0:00:38.39
-  train_samples_per_second = 15.627
-  train_steps_per_second = 3.907
-[INFO|modelcard.py:456] 2025-10-22 16:02:30,648 >> Dropping the following result as it does not have all the necessary fields:
-{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
-gl064:2368556:2368556 [1] NCCL INFO comm 0x15c0db00 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
-gl064:2368555:2368555 [0] NCCL INFO comm 0x14bb0450 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
-wandb:
-wandb: View run interactive_test at:
-wandb: Find logs at: wandb/run-20251022_160152-f7vqjhyf/logs
-
-========================================
-Training completed successfully
-End Time: Wed Oct 22 04:02:32 PM EDT 2025
-========================================
-
-========================================
-STAGE 2: Merging/Exporting Model
-Start Time: Wed Oct 22 04:02:32 PM EDT 2025
-========================================
-Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-Found most recent checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-Checkpoint details:
-  Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-  Last modified: 2025-10-22 16:02:30.204175325 -0400
-  Training step: 150
-Updating merge config to point to checkpoint...
-Successfully updated merge config
-Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-
-Merge config contents:
-  model_name_or_path: Qwen/Qwen2.5-0.5B
-  finetuning_type: lora
-  trust_remote_code: true
-  adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-  template: default
-  export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-
-Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
-  import pkg_resources
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,686 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:40,687 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:40,863 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:02:41,054 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:02:41,056 >> Model config Qwen2Config {
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "dtype": "bfloat16",
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 896,
-  "initializer_range": 0.02,
-  "intermediate_size": 4864,
-  "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "max_window_layers": 24,
-  "model_type": "qwen2",
-  "num_attention_heads": 14,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "transformers_version": "4.57.1",
-  "use_cache": true,
-  "use_mrope": false,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
-
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:02:41,129 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:02:41,298 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:02:41,348 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:02:41,348 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} - -[WARNING|logging.py:328] 2025-10-22 16:02:41,348 >> `torch_dtype` is deprecated! Use `dtype` instead! -[INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation. -[WARNING|logging.py:328] 2025-10-22 16:02:41,741 >> `torch_dtype` is deprecated! Use `dtype` instead! 
-[INFO|modeling_utils.py:1172] 2025-10-22 16:02:41,742 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors -[INFO|modeling_utils.py:2341] 2025-10-22 16:02:41,743 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. -[INFO|configuration_utils.py:986] 2025-10-22 16:02:41,743 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643 -} - -[INFO|configuration_utils.py:941] 2025-10-22 16:02:41,844 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json -[INFO|configuration_utils.py:986] 2025-10-22 16:02:41,844 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643, - "max_new_tokens": 2048 -} - -[INFO|dynamic_module_utils.py:423] 2025-10-22 16:02:41,879 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B. -[INFO|2025-10-22 16:02:41] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference. -[INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Merged 1 adapter(s). -[INFO|2025-10-22 16:02:42] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150 -[INFO|2025-10-22 16:02:42] llamafactory.model.loader:143 >> all params: 494,032,768 -[INFO|2025-10-22 16:02:42] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16. 
-[INFO|configuration_utils.py:491] 2025-10-22 16:02:42,967 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json -[INFO|configuration_utils.py:757] 2025-10-22 16:02:42,971 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json -[INFO|modeling_utils.py:4181] 2025-10-22 16:02:44,581 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors -[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:02:44,587 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja -[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:02:44,591 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json -[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:02:44,595 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json -[INFO|2025-10-22 16:02:44] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile - -======================================== -Merge/Export completed successfully -End Time: Wed Oct 22 04:02:45 PM EDT 2025 -======================================== - -======================================== -Preparing Training Artifacts -======================================== -Copying configuration files... -Copying and cleaning training logs... 
-Training artifacts prepared in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/training_artifacts
-Contents:
-Log files:
-
-========================================
-STAGE 3: Uploading to HuggingFace Hub
-Repository: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-Start Time: Wed Oct 22 04:02:45 PM EDT 2025
-========================================
-Uploading contents of: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-Directory structure:
-
-Executing: huggingface-cli upload TAUR-dev/testing_llamafactory_helper_quick_test__interactive /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged .
-Start hashing 17 files.
-Finished hashing 17 files.
-Warning: 'huggingface-cli upload' is deprecated. Use 'hf upload' instead.
- ...ive/merged/tokenizer.json: 100% 11.4MB / 11.4MB
- .../merged/model.safetensors: 100% 988MB / 988MB
-Processing Files (2 / 2) : 100% 1.00GB / 1.00GB, 231MB/s
-New Data Upload : 100% 716MB / 716MB, 188MB/s
-Removing 10 file(s) from commit that have not changed.
-https://huggingface.co/TAUR-dev/testing_llamafactory_helper_quick_test__interactive/tree/main/.
-
-========================================
-Upload completed successfully
-Model and training artifacts uploaded to: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-End Time: Wed Oct 22 04:02:53 PM EDT 2025
-========================================
-
-========================================
-STAGE 4: Cleanup
-========================================
-Keeping checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-Keeping merged model in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-
-========================================
-PIPELINE COMPLETED SUCCESSFULLY
-End Time: Wed Oct 22 04:02:53 PM EDT 2025
-========================================
-
-========================================
-Cleaning up LlamaFactory processes
-========================================
-Cleaned up processes on gl064.hpc.nyu.edu
-Cleaning up processes on worker node: gl065
-Process cleanup complete
-========================================
-Job Name: lf_torch_test__interactive
-Hostname: gl064.hpc.nyu.edu
-Number of nodes: 2
-GPUs per node: 2
-Start Time: Wed Oct 22 04:04:47 PM EDT 2025
-Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
-========================================
-Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
-
-========================================
-Configuration Paths
-========================================
-Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
-Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
-Dataset Info:
-Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-
-
-========================================
-Multi-Node Coordination
-========================================
-This is the master node - coordinating worker nodes...
-Master node: gl064
-Master port: 29500
-World size: 2
-
-Launching on worker node 1: gl065
-All worker nodes launched successfully
-Master node (this node) will now join training as rank 0
-
-
-========================================
-STAGE 1: Training Model
-Start Time: Wed Oct 22 04:04:50 PM EDT 2025
-========================================
-Multi-node training detected
-Nodes: 2, GPUs per node: 2
-Master address: gl064
-Master port: 29500
-Node rank: 0
-World size: 2
-CUDA_VISIBLE_DEVICES: 0,1
-LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
-Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
-
-Starting distributed training with torch.distributed.run...
-
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-========================================
-Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
-
-========================================
-Configuration Paths
-========================================
-Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
-Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
-Dataset Info:
-Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-
-
-========================================
-STAGE 1: Training Model
-Start Time: Wed Oct 22 04:04:54 PM EDT 2025
-========================================
-Multi-node training detected
-Nodes: 2, GPUs per node: 2
-Master address: gl064
-Master port: 29500
-Node rank: 1
-World size: 2
-CUDA_VISIBLE_DEVICES: 0,1
-LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
-Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
-
-Starting distributed training with torch.distributed.run...
-
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
- warnings.warn( -/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. - import pkg_resources -/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. - import pkg_resources -[INFO|2025-10-22 16:05:06] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled. 
-[INFO|2025-10-22 16:05:06] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 -[INFO|2025-10-22 16:05:06] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:06,834 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:05:07,005 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
-[INFO|configuration_utils.py:765] 2025-10-22 16:05:07,230 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:05:07,232 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} - -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file merges.txt from cache at 
/scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:07,297 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:05:07,463 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. -[INFO|2025-10-22 16:05:07] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... -[rank2]:[W1022 16:05:08.713448141 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. 
You can specify device_id in init_process_group() -gl065:3755181:3755181 [0] NCCL INFO cudaDriverVersion 13000 -gl065:3755181:3755181 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3755182:3755182 [1] NCCL INFO cudaDriverVersion 13000 -gl065:3755181:3755181 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0> -gl065:3755181:3755181 [0] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl065:3755182:3755182 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3755182:3755182 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0> -gl065:3755182:3755182 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl065:3755181:3755181 [0] NCCL INFO Comm config Blocking set to 1 -gl065:3755182:3755182 [1] NCCL INFO Comm config Blocking set to 1 -gl065:3755181:3755317 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl065:3755181:3755317 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. -gl065:3755182:3755318 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl065:3755182:3755318 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. 
-gl065:3755181:3755317 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3755181:3755317 [0] NCCL INFO NCCL_IB_HCA set to mlx5 -gl065:3755182:3755318 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl065:3755182:3755318 [1] NCCL INFO NCCL_IB_HCA set to mlx5 -gl065:3755181:3755317 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> -gl065:3755182:3755318 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0> -gl065:3755181:3755317 [0] NCCL INFO Initialized NET plugin IB -gl065:3755182:3755318 [1] NCCL INFO Initialized NET plugin IB -gl065:3755182:3755318 [1] NCCL INFO Assigned NET plugin IB to comm -gl065:3755181:3755317 [0] NCCL INFO Assigned NET plugin IB to comm -gl065:3755182:3755318 [1] NCCL INFO Using network IB -gl065:3755181:3755317 [0] NCCL INFO Using network IB -gl065:3755182:3755318 [1] NCCL INFO ncclCommInitRankConfig comm 0x167c6630 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x781b8128280447d8 - Init START -gl065:3755181:3755317 [0] NCCL INFO ncclCommInitRankConfig comm 0x12f3d8e0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x781b8128280447d8 - Init START -gl065:3755182:3755318 [1] NCCL INFO RAS client listening socket at ::1<28028> -gl065:3755181:3755317 [0] NCCL INFO RAS client listening socket at ::1<28028> -gl065:3755182:3755318 [1] NCCL INFO Bootstrap timings total 0.003305 (create 0.000026, send 0.000631, recv 0.001224, ring 0.000366, delay 0.000000) -gl065:3755181:3755317 [0] NCCL INFO Bootstrap timings total 0.015078 (create 0.000023, send 0.000621, recv 0.001675, ring 0.000753, delay 0.000000) -gl065:3755181:3755317 [0] NCCL INFO Setting affinity for GPU 0 to 0-15 -gl065:3755182:3755318 [1] NCCL INFO Setting affinity for GPU 1 to 0-15 -gl065:3755181:3755317 [0] NCCL INFO comm 0x12f3d8e0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0 -gl065:3755182:3755318 [1] NCCL INFO comm 0x167c6630 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0 
-gl065:3755181:3755317 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1 -gl065:3755182:3755318 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 -gl065:3755181:3755317 [0] NCCL INFO P2P Chunksize set to 131072 -gl065:3755182:3755318 [1] NCCL INFO P2P Chunksize set to 131072 -gl065:3755182:3755318 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl065:3755181:3755317 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl065:3755182:3755323 [1] NCCL INFO [Proxy Service] Device 1 CPU core 6 -gl065:3755182:3755325 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 6 -gl065:3755181:3755324 [0] NCCL INFO [Proxy Service] Device 0 CPU core 10 -gl065:3755181:3755326 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 11 -gl065:3755182:3755318 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl065:3755182:3755318 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl065:3755181:3755317 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl065:3755181:3755317 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl065:3755181:3755317 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. -gl065:3755182:3755318 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
-gl065:3755181:3755317 [0] NCCL INFO ncclCommInitRankConfig comm 0x12f3d8e0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x781b8128280447d8 - Init COMPLETE -gl065:3755182:3755318 [1] NCCL INFO ncclCommInitRankConfig comm 0x167c6630 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x781b8128280447d8 - Init COMPLETE -gl065:3755181:3755317 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.00, rest 0.00) -gl065:3755182:3755318 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00) -gl065:3755181:3755327 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0 -gl065:3755181:3755327 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0 -gl065:3755181:3755329 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 14 -gl065:3755181:3755327 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct -gl065:3755181:3755327 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct -gl065:3755182:3755328 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0 -gl065:3755182:3755328 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0 -gl065:3755182:3755330 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 15 -gl065:3755182:3755328 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 -gl065:3755181:3755327 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 -training example: -input_ids: -[33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 
7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 
34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 
10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 
522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198] -inputs: -Human: Answer the following problem. Explain your reasoning step by step. When you are finished, give your answer in this format: (your answer). - -# Problem -Using the numbers in the list [67, 71, 31], create an equation that equals 169. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Your solution should include a series of steps "Step X:" where each step is a mathematical operation and the final step ultimately leads to the target number or it should be a single equation that results in the target. - -Give your answer in the following format: - -(your answer) - - -Where "(your answer)" is the list of steps to reach the target number or it should be a single equation that results in the target. - -For example: -If the list of numbers was [1, 2, 3] and the target was 1, you could write: - -Step 1: 1 + 2 = 3 -Step 2: 3 / 3 = 1 - - -or - - -(1 + 2) / 3 - - -Let's think step by step.<|endoftext|> -Assistant: - -First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this. -Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169. -Step 2: Consider possible operations - Addition, subtraction, multiplication, and division. -Step 3: Try different combinations of these operations with the given numbers. -Step 4: Test various equations until we find one that equals 169. -Step 5: Once we have found the correct equation, present it in the specified format. -After going through all these steps, here's the final equation that equals 169: - - -(67 * 2) + 71 - 31 - - - -Let us verify this answer: - -First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect. - - -Incorrect - - -Let me verify my answer. 
- -Rewritten Question: - -Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /). - -Step 1: Multiply 67 and 31. -Step 2: Add 71 to the result from Step 1. -Step 3: The final result will be 169. - - -(67 * 31) + 71 = 169 - - - -Let us verify this answer: - -First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect. - - -Incorrect - - -Time to review my response one more time. - -First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this. -The goal is to create an equation using the numbers 67, 71, and 31 that equals 169. -I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /). -Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works. -After finding the correct combination, I will present my solution in the " -(your answer) -" format. -Here's my detailed plan: - -1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division. -2. Evaluate each combination to see if it equals 169. -3. If no combination equals 169, repeat the process with new combinations until I find one that works. -4. Once I have found the correct combination, express it as a single equation in the " -(your answer) -" format. - -Now let me begin searching for the correct combination! - - -Let us verify this answer: - -1. Step 1: 67 * 31 = 2077 -2. Step 2: 2077 - 71 = 2006 - -Therefore, the final result is 2006, which does not equal 169. The answer is incorrect. - - -Incorrect - - -But wait, let me think about it again. - -Rewritten Question: - -What is the result when you add two numbers from the list [67, 71, 31] together? - -Step 1: Add 67 and 31. -Step 2: Take the sum from Step 1 and add 71. 
- -Final Equation: (67 + 31) + 71 - - -(67 + 31) + 71 - - - -Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct. - - -Correct - - - - - -Hence, the correct answer is: - - -(67 + 31) + 71 -<|endoftext|> - -label_ids: -[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 
911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 
16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 
11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198] -labels: - - -First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this. -Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169. -Step 2: Consider possible operations - Addition, subtraction, multiplication, and division. -Step 3: Try different combinations of these operations with the given numbers. -Step 4: Test various equations until we find one that equals 169. -Step 5: Once we have found the correct equation, present it in the specified format. -After going through all these steps, here's the final equation that equals 169: - - -(67 * 2) + 71 - 31 - - - -Let us verify this answer: - -First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect. - - -Incorrect - - -Let me verify my answer. - -Rewritten Question: - -Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /). 
- -Step 1: Multiply 67 and 31. -Step 2: Add 71 to the result from Step 1. -Step 3: The final result will be 169. - - -(67 * 31) + 71 = 169 - - - -Let us verify this answer: - -First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect. - - -Incorrect - - -Time to review my response one more time. - -First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this. -The goal is to create an equation using the numbers 67, 71, and 31 that equals 169. -I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /). -Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works. -After finding the correct combination, I will present my solution in the " -(your answer) -" format. -Here's my detailed plan: - -1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division. -2. Evaluate each combination to see if it equals 169. -3. If no combination equals 169, repeat the process with new combinations until I find one that works. -4. Once I have found the correct combination, express it as a single equation in the " -(your answer) -" format. - -Now let me begin searching for the correct combination! - - -Let us verify this answer: - -1. Step 1: 67 * 31 = 2077 -2. Step 2: 2077 - 71 = 2006 - -Therefore, the final result is 2006, which does not equal 169. The answer is incorrect. - - -Incorrect - - -But wait, let me think about it again. - -Rewritten Question: - -What is the result when you add two numbers from the list [67, 71, 31] together? - -Step 1: Add 67 and 31. -Step 2: Take the sum from Step 1 and add 71. - -Final Equation: (67 + 31) + 71 - - -(67 + 31) + 71 - - - -Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct. 
- - -Correct - - - - - -Hence, the correct answer is: - - -(67 + 31) + 71 -<|endoftext|> - -[INFO|configuration_utils.py:765] 2025-10-22 16:05:10,670 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:05:10,670 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} - -[INFO|2025-10-22 16:05:10] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training. -[WARNING|logging.py:328] 2025-10-22 16:05:11,004 >> `torch_dtype` is deprecated! Use `dtype` instead! 
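The label_ids dump above shows the supervised fine-tuning masking scheme: every prompt token is replaced with -100 while the assistant-response tokens keep their real IDs, and the cross-entropy loss skips positions labeled -100, so only the response is trained on. A minimal sketch of the idea (illustrative only, not LlamaFactory's actual code; `mask_prompt` is a hypothetical helper):

```python
# Sketch (assumption, not the library's real implementation): derive
# label_ids from input_ids by masking the prompt span with -100, the
# value cross-entropy losses conventionally ignore (ignore_index=-100).

IGNORE_INDEX = -100

def mask_prompt(input_ids, prompt_len):
    """Copy input_ids, replacing the first prompt_len positions with -100."""
    return [IGNORE_INDEX] * prompt_len + input_ids[prompt_len:]

# Toy example: 5 prompt tokens followed by a 3-token response.
ids = [101, 102, 103, 104, 105, 9, 8, 7]
labels = mask_prompt(ids, 5)
assert labels == [-100, -100, -100, -100, -100, 9, 8, 7]
```

This matches the structure of the dump above: a long run of -100 followed by the response token IDs verbatim.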
-[INFO|modeling_utils.py:1172] 2025-10-22 16:05:11,005 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors -[INFO|modeling_utils.py:2341] 2025-10-22 16:05:11,006 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16. -[INFO|configuration_utils.py:986] 2025-10-22 16:05:11,007 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643, - "use_cache": false -} - -[INFO|configuration_utils.py:941] 2025-10-22 16:05:11,298 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json -[INFO|configuration_utils.py:986] 2025-10-22 16:05:11,298 >> Generate config GenerationConfig { - "bos_token_id": 151643, - "eos_token_id": 151643, - "max_new_tokens": 2048 -} - -[INFO|dynamic_module_utils.py:423] 2025-10-22 16:05:11,332 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B. -[INFO|2025-10-22 16:05:11] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled. -[INFO|2025-10-22 16:05:11] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference. -[INFO|2025-10-22 16:05:11] llamafactory.model.adapter:143 >> Upcasting trainable params to float32. 
-[INFO|2025-10-22 16:05:11] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA -[INFO|2025-10-22 16:05:11] llamafactory.model.model_utils.misc:143 >> Found linear modules: up_proj,gate_proj,q_proj,o_proj,k_proj,v_proj,down_proj -[INFO|2025-10-22 16:05:11] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826 -[WARNING|trainer.py:906] 2025-10-22 16:05:11,575 >> The model is already on multiple devices. Skipping the move to device specified in `args`. -[INFO|trainer.py:699] 2025-10-22 16:05:11,577 >> max_steps is given, it will override any value given in num_train_epochs -[INFO|trainer.py:749] 2025-10-22 16:05:11,577 >> Using auto half precision backend -[WARNING|trainer.py:982] 2025-10-22 16:05:11,578 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}. -[INFO|trainer.py:2519] 2025-10-22 16:05:12,166 >> ***** Running training ***** -[INFO|trainer.py:2520] 2025-10-22 16:05:12,166 >> Num examples = 48,600 -[INFO|trainer.py:2521] 2025-10-22 16:05:12,166 >> Num Epochs = 1 -[INFO|trainer.py:2522] 2025-10-22 16:05:12,166 >> Instantaneous batch size per device = 1 -[INFO|trainer.py:2525] 2025-10-22 16:05:12,166 >> Total train batch size (w. 
parallel, distributed & accumulation) = 4 -[INFO|trainer.py:2526] 2025-10-22 16:05:12,166 >> Gradient Accumulation steps = 1 -[INFO|trainer.py:2527] 2025-10-22 16:05:12,166 >> Total optimization steps = 100 -[INFO|trainer.py:2528] 2025-10-22 16:05:12,167 >> Number of trainable parameters = 4,399,104 -[INFO|integration_utils.py:867] 2025-10-22 16:05:12,178 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" -wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin -wandb: Tracking run with wandb version 0.22.2 -wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_160512-dppinxzz -wandb: Run `wandb offline` to turn off syncing. -wandb: Syncing run interactive_test -wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory -wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/dppinxzz - 0%| | 0/100 [00:00> - -Training completed. 
Do not forget to share your model on huggingface.co/models =) - - -gl065:3755182:3755182 [1] NCCL INFO comm 0x167c6630 rank 3 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE -gl065:3755181:3755181 [0] NCCL INFO comm 0x12f3d8e0 rank 2 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE - 2025-10-22 16:05:25,446 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50 -[INFO|configuration_utils.py:765] 2025-10-22 16:05:25,628 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json 
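The trainer stats printed above can be cross-checked with a little arithmetic: the run uses 2 nodes with 2 GPUs each (the NCCL lines report nranks 4), per-device batch size 1, and gradient accumulation 1, which yields the reported total train batch size of 4; likewise the LoRA adapter's 4,399,104 trainable parameters out of 498,431,872 total give the reported trainable% of 0.8826. A minimal sketch of the cross-check:

```python
# Cross-checking numbers reported in the trainer log above.
# World size: 2 nodes x 2 GPUs per node (NCCL reports nranks 4).
nodes, gpus_per_node = 2, 2
world_size = nodes * gpus_per_node
assert world_size == 4

# Total train batch size = per-device batch * world size * grad accumulation.
per_device_batch, grad_accum = 1, 1
assert per_device_batch * world_size * grad_accum == 4

# LoRA trainable fraction: trainable params / all params, as a percentage.
trainable, total = 4_399_104, 498_431_872
assert round(100 * trainable / total, 4) == 0.8826
```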
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:05:25,782 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja -[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:05:25,804 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json -[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:05:25,809 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json - 51%| | 51/100 [00:13<00:24, 2.01it/s] 52%| | 52/100 [00:13<00:20, 2.30it/s] 53%| | 53/100 [00:13<00:17, 2.66it/s] 54%| | 54/100 [00:14<00:14, 3.23it/s] 55%| | 55/100 [00:14<00:13, 3.33it/s] 56%| | 56/100 [00:14<00:11, 3.94it/s] 57%| | 57/100 [00:14<00:10, 4.02it/s] 58%| | 58/100 [00:14<00:09, 4.43it/s] 59%| | 59/100 [00:15<00:08, 5.12it/s] 60%| | 60/100 [00:15<00:07, 5.52it/s] {'loss': 0.6288, 'grad_norm': 0.4911618232727051, 'learning_rate': 2.05e-05, 'epoch': 0.0} - 60%| | 60/100 [00:15<00:07, 5.52it/s] 61%| | 61/100 [00:15<00:07, 5.14it/s] 62%| | 62/100 [00:15<00:06, 5.71it/s] 63%| | 63/100 [00:15<00:07, 5.15it/s] 64%| | 64/100 [00:15<00:06, 5.44it/s] 65%| | 65/100 [00:16<00:07, 4.97it/s] 66%| | 66/100 [00:16<00:07, 4.59it/s] 67%| | 67/100 [00:16<00:06, 4.86it/s] 68%| | 68/100 [00:16<00:07, 4.54it/s] 69%| | 69/100 [00:17<00:07, 4.15it/s] 70%| | 70/100 [00:17<00:07, 4.10it/s] {'loss': 0.6135, 'grad_norm': 0.5213523507118225, 'learning_rate': 1.55e-05, 'epoch': 0.01} - 70%| | 70/100 [00:17<00:07, 4.10it/s] 71%| | 71/100 [00:17<00:07, 3.97it/s] 72%| | 72/100 [00:17<00:06, 4.49it/s] 73%| | 73/100 [00:18<00:06, 4.00it/s] 74%| | 74/100 [00:18<00:06, 4.19it/s] 75%| | 75/100 [00:18<00:05, 4.70it/s] 76%| | 76/100 [00:18<00:05, 4.73it/s] 77%| | 77/100 [00:18<00:04, 5.24it/s] 78%| | 78/100 [00:19<00:04, 4.69it/s] 79%| | 79/100 
[00:19<00:04, 4.55it/s] 80%| | 80/100 [00:19<00:04, 4.27it/s] {'loss': 0.6435, 'grad_norm': 0.4015622138977051, 'learning_rate': 1.05e-05, 'epoch': 0.01} - 80%| | 80/100 [00:19<00:04, 4.27it/s] 81%| | 81/100 [00:19<00:04, 4.66it/s] 82%| | 82/100 [00:20<00:04, 4.06it/s] 83%| | 83/100 [00:20<00:03, 4.45it/s] 84%| | 84/100 [00:20<00:03, 4.40it/s] 85%| | 85/100 [00:20<00:03, 4.45it/s] 86%| | 86/100 [00:20<00:02, 5.05it/s] 87%| | 87/100 [00:21<00:02, 5.18it/s] 88%| | 88/100 [00:21<00:02, 4.87it/s] 89%| | 89/100 [00:21<00:02, 4.59it/s] 90%| | 90/100 [00:21<00:01, 5.20it/s] {'loss': 0.6313, 'grad_norm': 0.5440771579742432, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.01} - 90%| | 90/100 [00:21<00:01, 5.20it/s] 91%| | 91/100 [00:21<00:01, 4.63it/s] 92%|| 92/100 [00:22<00:01, 4.52it/s] 93%|| 93/100 [00:22<00:01, 4.74it/s] 94%|| 94/100 [00:22<00:01, 4.69it/s] 95%|| 95/100 [00:22<00:01, 4.78it/s] 96%|| 96/100 [00:23<00:00, 4.42it/s] 97%|| 97/100 [00:23<00:00, 3.84it/s] 98%|| 98/100 [00:23<00:00, 4.26it/s] 99%|| 99/100 [00:23<00:00, 4.53it/s]100%|| 100/100 [00:23<00:00, 4.30it/s] {'loss': 0.6241, 'grad_norm': 0.43957775831222534, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.01} -100%|| 100/100 [00:23<00:00, 4.30it/s][INFO|trainer.py:4309] 2025-10-22 16:05:37,200 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100 -[INFO|configuration_utils.py:765] 2025-10-22 16:05:37,345 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json 
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:05:37,515 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja -[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:05:37,547 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json -[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:05:37,565 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json -[INFO|trainer.py:2810] 2025-10-22 16:05:38,041 >> - -Training completed. 
Do not forget to share your model on huggingface.co/models =) - - - {'train_runtime': 25.8748, 'train_samples_per_second': 15.459, 'train_steps_per_second': 3.865, 'train_loss': 0.6805182361602783, 'epoch': 0.01} -100%|| 100/100 [00:24<00:00, 4.30it/s]100%|| 100/100 [00:24<00:00, 4.02it/s] -[INFO|trainer.py:4309] 2025-10-22 16:05:38,051 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints -[INFO|configuration_utils.py:765] 2025-10-22 16:05:38,140 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:05:38,141 >> Model config Qwen2Config { - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "dtype": "bfloat16", - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 32768, - "max_window_layers": 24, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "4.57.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 151936 -} - 
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:05:38,274 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:05:38,280 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:05:38,283 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
***** train metrics *****
  epoch                    =     0.0082
  total_flos               =  1473847GF
  train_loss               =     0.6805
  train_runtime            = 0:00:25.87
  train_samples_per_second =     15.459
  train_steps_per_second   =      3.865
[INFO|modelcard.py:456] 2025-10-22 16:05:38,469 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
gl064:2370173:2370173 [1] NCCL INFO comm 0x16d00aa0 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
gl064:2370172:2370172 [0] NCCL INFO comm 0x14055870 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
wandb:
wandb: View run interactive_test at:
wandb: Find logs at: wandb/run-20251022_160512-dppinxzz/logs

========================================
Training completed successfully
End Time: Wed Oct 22 04:05:41 PM EDT 2025
========================================

========================================
STAGE 2: Merging/Exporting Model
Start Time: Wed Oct 22 04:05:41 PM EDT 2025
========================================
Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Found most recent checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
Checkpoint details:
  Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
STAGE 2: Merging/Exporting Model
Start Time: Wed Oct 22 04:05:41 PM EDT 2025
========================================
Skipping merge/export on worker node (rank 1)
Skipping artifact preparation and upload on worker node (rank 1)
Skipping cleanup on worker node (rank 1)

========================================
PIPELINE COMPLETED SUCCESSFULLY
End Time: Wed Oct 22 04:05:41 PM EDT 2025
========================================
Successfully updated merge config
Cleaning up LlamaFactory processes
========================================
Cleaned up processes on gl065.hpc.nyu.edu
Process cleanup complete
Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150

Merge config contents:
  model_name_or_path: Qwen/Qwen2.5-0.5B
  finetuning_type: lora
  trust_remote_code: true
  adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
  template: default
  export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged

Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  import pkg_resources
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,842 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,842 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,842 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,842 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,843 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,843 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:49,843 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:05:50,013 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
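The merge stage above first resolves the most recent `checkpoint-*` directory and then rewrites `adapter_name_or_path` in the merge config to point at it. A minimal sketch of that step; `latest_checkpoint` and `point_merge_config_at` are hypothetical helper names, not the pipeline's actual code, and the text-based rewrite stands in for whatever YAML handling the helper scripts really use:

```python
import re
from pathlib import Path

def latest_checkpoint(ckpt_root: str) -> Path:
    """Pick the checkpoint-N directory with the highest step count,
    mirroring the 'Found most recent checkpoint' log line."""
    dirs = [d for d in Path(ckpt_root).glob("checkpoint-*") if d.is_dir()]
    if not dirs:
        raise FileNotFoundError(f"no checkpoint-* directories under {ckpt_root}")
    return max(dirs, key=lambda d: int(d.name.split("-")[-1]))

def point_merge_config_at(config_path: str, ckpt: Path) -> None:
    """Rewrite adapter_name_or_path in a merge_config.yaml to the chosen
    checkpoint (plain-text substitution to avoid a YAML dependency)."""
    text = Path(config_path).read_text()
    text = re.sub(r"(?m)^adapter_name_or_path:.*$",
                  f"adapter_name_or_path: {ckpt}", text)
    Path(config_path).write_text(text)
```

Sorting on the numeric suffix (rather than lexicographically) is what makes `checkpoint-150` win over `checkpoint-100` here.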
[INFO|configuration_utils.py:765] 2025-10-22 16:05:50,242 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 16:05:50,244 >> Model config Qwen2Config { ...identical to the dump above... }
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:05:50,336 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:05:50,499 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:765] 2025-10-22 16:05:50,543 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 16:05:50,544 >> Model config Qwen2Config { ...identical to the dump above... }
[WARNING|logging.py:328] 2025-10-22 16:05:50,544 >> `torch_dtype` is deprecated! Use `dtype` instead!
[INFO|2025-10-22 16:05:50] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
[WARNING|logging.py:328] 2025-10-22 16:05:50,858 >> `torch_dtype` is deprecated! Use `dtype` instead!
[INFO|modeling_utils.py:1172] 2025-10-22 16:05:50,859 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
[INFO|modeling_utils.py:2341] 2025-10-22 16:05:50,859 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:986] 2025-10-22 16:05:50,860 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643
}

[INFO|configuration_utils.py:941] 2025-10-22 16:05:50,951 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
[INFO|configuration_utils.py:986] 2025-10-22 16:05:50,952 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "max_new_tokens": 2048
}

[INFO|dynamic_module_utils.py:423] 2025-10-22 16:05:50,980 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
[INFO|2025-10-22 16:05:50] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
[INFO|2025-10-22 16:05:52] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
[INFO|2025-10-22 16:05:52] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
[INFO|2025-10-22 16:05:52] llamafactory.model.loader:143 >> all params: 494,032,768
[INFO|2025-10-22 16:05:52] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
[INFO|configuration_utils.py:491] 2025-10-22 16:05:52,089 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
[INFO|configuration_utils.py:757] 2025-10-22 16:05:52,094 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
[INFO|modeling_utils.py:4181] 2025-10-22 16:05:53,693 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:05:53,698 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:05:53,702 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:05:53,706 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
[INFO|2025-10-22 16:05:53] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile

========================================
Merge/Export completed successfully
End Time: Wed Oct 22 04:05:54 PM EDT 2025
========================================

========================================
Preparing Training Artifacts
========================================
Copying configuration files...
Copying and cleaning training logs...
Training artifacts prepared in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/training_artifacts
Contents:
Log files:

========================================
STAGE 3: Uploading to HuggingFace Hub
Repository: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
Start Time: Wed Oct 22 04:05:54 PM EDT 2025
========================================
Uploading contents of: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
Directory structure:

Executing: huggingface-cli upload TAUR-dev/testing_llamafactory_helper_quick_test__interactive /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged .
Start hashing 17 files.
Finished hashing 17 files.
Warning: 'huggingface-cli upload' is deprecated. Use 'hf upload' instead.
...ive/merged/tokenizer.json: 100% 11.4MB / 11.4MB
.../merged/model.safetensors: 100% 988MB / 988MB
Processing Files (2 / 2) : 100% 1.00GB / 1.00GB, 488MB/s
New Data Upload : 0.00B / 0.00B, 0.00B/s
Removing 12 file(s) from commit that have not changed.
https://huggingface.co/TAUR-dev/testing_llamafactory_helper_quick_test__interactive/tree/main/.
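The deprecation warning above says `huggingface-cli upload` should be replaced with `hf upload`, which takes the same positional arguments (repo id, local path, path in repo). A small sketch that builds the equivalent command line; `hf_upload_cmd` is a hypothetical helper, not part of the pipeline:

```python
import shlex

def hf_upload_cmd(repo_id: str, local_dir: str, path_in_repo: str = ".") -> str:
    """Build the 'hf upload' invocation equivalent to the deprecated
    'huggingface-cli upload' call logged above."""
    return shlex.join(["hf", "upload", repo_id, local_dir, path_in_repo])
```

Using `shlex.join` keeps the command safe to paste into a shell even if a path ever contains spaces.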
========================================
Upload completed successfully
Model and training artifacts uploaded to: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
End Time: Wed Oct 22 04:06:00 PM EDT 2025
========================================

========================================
STAGE 4: Cleanup
========================================
Keeping checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Keeping merged model in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged

========================================
PIPELINE COMPLETED SUCCESSFULLY
End Time: Wed Oct 22 04:06:00 PM EDT 2025
========================================

========================================
Cleaning up LlamaFactory processes
========================================
Cleaned up processes on gl064.hpc.nyu.edu
Cleaning up processes on worker node: gl065
Process cleanup complete
========================================
Job Name: lf_torch_test__interactive
Hostname: gl064.hpc.nyu.edu
Number of nodes: 2
GPUs per node: 2
Start Time: Wed Oct 22 04:08:15 PM EDT 2025
Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
========================================
Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env

========================================
Configuration Paths
========================================
Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
Dataset Info:
Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive

========================================
Multi-Node Coordination
========================================
This is the master node - coordinating worker nodes...
Master node: gl064
Master port: 29500
World size: 2

Launching on worker node 1: gl065
All worker nodes launched successfully
Master node (this node) will now join training as rank 0

========================================
STAGE 1: Training Model
Start Time: Wed Oct 22 04:08:17 PM EDT 2025
========================================
Multi-node training detected
Nodes: 2, GPUs per node: 2
Master address: gl064
Master port: 29500
Node rank: 0
World size: 2
CUDA_VISIBLE_DEVICES: 0,1
LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml

Starting distributed training with torch.distributed.run...

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
========================================
Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env

========================================
Configuration Paths
========================================
Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
Dataset Info:
Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive

========================================
STAGE 1: Training Model
Start Time: Wed Oct 22 04:08:22 PM EDT 2025
========================================
Multi-node training detected
Nodes: 2, GPUs per node: 2
Master address: gl064
Master port: 29500
Node rank: 1
World size: 2
CUDA_VISIBLE_DEVICES: 0,1
LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml

Starting distributed training with torch.distributed.run...

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  import pkg_resources
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  import pkg_resources
[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
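The worker node's two processes report global ranks 2 and 3 (on cuda:0 and cuda:1) within a world size of 4, which matches how torchrun numbers ranks node by node. A minimal sketch of that mapping, using the values logged in this run:

```python
def global_rank(node_rank: int, local_rank: int, nproc_per_node: int = 2) -> int:
    """Derive a process's global rank the way torchrun does:
    ranks are assigned node by node, nproc_per_node per node."""
    return node_rank * nproc_per_node + local_rank

def world_size(nnodes: int = 2, nproc_per_node: int = 2) -> int:
    """Total number of training processes across all nodes."""
    return nnodes * nproc_per_node
```

With `nnodes=2` and `nproc_per_node=2` this gives ranks 0-1 on the master (gl064) and ranks 2-3 on the worker (gl065), exactly as the parser log lines show.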
[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,527 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|configuration_utils.py:765] 2025-10-22 16:08:34,718 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 16:08:34,719 >> Model config Qwen2Config { ...identical to the dump above... }
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file chat_template.jinja from cache at None
[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,958 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-10-22 16:08:34] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
gl065:3757610:3757610 [1] NCCL INFO cudaDriverVersion 13000
[rank2]:[W1022 16:08:35.111666688 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
gl065:3757610:3757610 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl065:3757610:3757610 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
gl065:3757610:3757610 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
gl065:3757610:3757610 [1] NCCL INFO Comm config Blocking set to 1
gl065:3757609:3757609 [0] NCCL INFO cudaDriverVersion 13000
gl065:3757609:3757609 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl065:3757609:3757609 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
gl065:3757609:3757609 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
gl065:3757609:3757609 [0] NCCL INFO Comm config Blocking set to 1
gl065:3757610:3757745 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
gl065:3757610:3757745 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
gl065:3757610:3757745 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
gl065:3757610:3757745 [1] NCCL INFO NCCL_IB_HCA set to mlx5
gl065:3757609:3757746 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
gl065:3757609:3757746 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
-gl065:3757610:3757745 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
-gl065:3757610:3757745 [1] NCCL INFO Initialized NET plugin IB
-gl065:3757609:3757746 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
-gl065:3757609:3757746 [0] NCCL INFO NCCL_IB_HCA set to mlx5
-gl065:3757610:3757745 [1] NCCL INFO Assigned NET plugin IB to comm
-gl065:3757610:3757745 [1] NCCL INFO Using network IB
-gl065:3757610:3757745 [1] NCCL INFO ncclCommInitRankConfig comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init START
-gl065:3757609:3757746 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
-gl065:3757609:3757746 [0] NCCL INFO Initialized NET plugin IB
-gl065:3757609:3757746 [0] NCCL INFO Assigned NET plugin IB to comm
-gl065:3757609:3757746 [0] NCCL INFO Using network IB
-gl065:3757609:3757746 [0] NCCL INFO ncclCommInitRankConfig comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init START
-gl065:3757609:3757746 [0] NCCL INFO RAS client listening socket at ::1<28028>
-gl065:3757610:3757745 [1] NCCL INFO RAS client listening socket at ::1<28028>
-gl065:3757610:3757745 [1] NCCL INFO Bootstrap timings total 0.014799 (create 0.000048, send 0.000601, recv 0.001295, ring 0.005397, delay 0.000000)
-gl065:3757609:3757746 [0] NCCL INFO Bootstrap timings total 0.018568 (create 0.000023, send 0.000388, recv 0.000964, ring 0.005735, delay 0.000000)
-gl065:3757609:3757746 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
-gl065:3757610:3757745 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
-gl065:3757610:3757745 [1] NCCL INFO comm 0x15ee2290 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
-gl065:3757609:3757746 [0] NCCL INFO comm 0x13a861a0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
-gl065:3757610:3757745 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
-gl065:3757610:3757745 [1] NCCL INFO P2P Chunksize set to 131072
-gl065:3757609:3757746 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
-gl065:3757609:3757746 [0] NCCL INFO P2P Chunksize set to 131072
-gl065:3757610:3757745 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
-gl065:3757609:3757746 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
-gl065:3757610:3757751 [1] NCCL INFO [Proxy Service] Device 1 CPU core 11
-gl065:3757610:3757753 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 13
-gl065:3757609:3757754 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 14
-gl065:3757609:3757752 [0] NCCL INFO [Proxy Service] Device 0 CPU core 12
-gl065:3757609:3757746 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-gl065:3757609:3757746 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
-gl065:3757610:3757745 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-gl065:3757610:3757745 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
-gl065:3757609:3757746 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
-gl065:3757609:3757746 [0] NCCL INFO ncclCommInitRankConfig comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
-gl065:3757610:3757745 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
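The comm lines above (nRanks 4, nNodes 2, localRanks 2) show how global ranks map onto nodes and GPUs: ranks 2 and 3 are gl065's cuda:0 and cuda:1. A minimal illustrative sketch of that layout (the helper name is hypothetical, not part of the pipeline):

```python
def rank_layout(num_nodes: int, gpus_per_node: int) -> dict:
    """Map each global rank to (node_index, local_rank).

    Mirrors the layout in the log: 2 nodes x 2 GPUs -> world size 4,
    with ranks 2 and 3 on the second node as local ranks 0 and 1.
    """
    return {
        rank: (rank // gpus_per_node, rank % gpus_per_node)
        for rank in range(num_nodes * gpus_per_node)
    }

layout = rank_layout(num_nodes=2, gpus_per_node=2)
# Ranks 2 and 3 land on node 1 (gl065) as cuda:0 and cuda:1.
```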
-gl065:3757609:3757746 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.14 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
-gl065:3757610:3757745 [1] NCCL INFO ncclCommInitRankConfig comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
-gl065:3757610:3757745 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.15 (kernels 0.09, alloc 0.01, bootstrap 0.01, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
-gl065:3757609:3757756 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0
-gl065:3757609:3757756 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0
-gl065:3757609:3757757 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 9
-gl065:3757609:3757756 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
-gl065:3757609:3757756 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
-gl065:3757610:3757755 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0
-gl065:3757610:3757755 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0
-gl065:3757610:3757758 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 2
-gl065:3757610:3757755 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
-gl065:3757609:3757756 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
-training example:
-input_ids:
-[33975, 25, 21806, 279, 2701, 3491, 13, ... (remaining token-ID list elided; the decoded text is printed below under "inputs:")]
-inputs:
-Human: 
Answer the following problem. Explain your reasoning step by step. When you are finished, give your answer in this format: (your answer).
-
-# Problem
-Using the numbers in the list [67, 71, 31], create an equation that equals 169. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Your solution should include a series of steps "Step X:" where each step is a mathematical operation and the final step ultimately leads to the target number or it should be a single equation that results in the target.
-
-Give your answer in the following format:
-
-(your answer)
-
-
-Where "(your answer)" is the list of steps to reach the target number or it should be a single equation that results in the target.
-
-For example:
-If the list of numbers was [1, 2, 3] and the target was 1, you could write:
-
-Step 1: 1 + 2 = 3
-Step 2: 3 / 3 = 1
-
-
-or
-
-
-(1 + 2) / 3
-
-
-Let's think step by step.<|endoftext|>
-Assistant: 
-
-First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
-Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
-Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
-Step 3: Try different combinations of these operations with the given numbers.
-Step 4: Test various equations until we find one that equals 169.
-Step 5: Once we have found the correct equation, present it in the specified format.
-After going through all these steps, here's the final equation that equals 169:
-
-
-(67 * 2) + 71 - 31
-
-
-
-Let us verify this answer:
-
-First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
-
-
-Incorrect
-
-
-Let me verify my answer.
-
-Rewritten Question:
-
-Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
-
-Step 1: Multiply 67 and 31.
-Step 2: Add 71 to the result from Step 1.
-Step 3: The final result will be 169.
-
-
-(67 * 31) + 71 = 169
-
-
-
-Let us verify this answer:
-
-First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
-
-
-Incorrect
-
-
-Time to review my response one more time.
-
-First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
-The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
-I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
-Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
-After finding the correct combination, I will present my solution in the "
-(your answer)
-" format.
-Here's my detailed plan:
-
-1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
-2. Evaluate each combination to see if it equals 169.
-3. If no combination equals 169, repeat the process with new combinations until I find one that works.
-4. Once I have found the correct combination, express it as a single equation in the "
-(your answer)
-" format.
-
-Now let me begin searching for the correct combination!
-
-
-Let us verify this answer:
-
-1. Step 1: 67 * 31 = 2077
-2. Step 2: 2077 - 71 = 2006
-
-Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
-
-
-Incorrect
-
-
-But wait, let me think about it again.
-
-Rewritten Question:
-
-What is the result when you add two numbers from the list [67, 71, 31] together?
-
-Step 1: Add 67 and 31.
-Step 2: Take the sum from Step 1 and add 71.
-
-Final Equation: (67 + 31) + 71
-
-
-(67 + 31) + 71
-
-
-
-Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
-
-
-Correct
-
-
-
-
-
-Hence, the correct answer is:
-
-
-(67 + 31) + 71
-<|endoftext|>
-
-label_ids:
-[-100, -100, -100, ... (prompt positions are masked with -100; the remaining label IDs match the assistant portion of input_ids and are elided here; the decoded text is printed below under "labels:")]
-labels:
-
-
-First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
-Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
-Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
-Step 3: Try different combinations of these operations with the given numbers.
-Step 4: Test various equations until we find one that equals 169.
-Step 5: Once we have found the correct equation, present it in the specified format.
-After going through all these steps, here's the final equation that equals 169:
-
-
-(67 * 2) + 71 - 31
-
-
-
-Let us verify this answer:
-
-First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
-
-
-Incorrect
-
-
-Let me verify my answer.
-
-Rewritten Question:
-
-Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
-
-Step 1: Multiply 67 and 31.
-Step 2: Add 71 to the result from Step 1.
-Step 3: The final result will be 169.
-
-
-(67 * 31) + 71 = 169
-
-
-
-Let us verify this answer:
-
-First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
-
-
-Incorrect
-
-
-Time to review my response one more time.
-
-First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
-The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
-I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
-Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
-After finding the correct combination, I will present my solution in the "
-(your answer)
-" format.
-Here's my detailed plan:
-
-1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
-2. Evaluate each combination to see if it equals 169.
-3. If no combination equals 169, repeat the process with new combinations until I find one that works.
-4. Once I have found the correct combination, express it as a single equation in the "
-(your answer)
-" format.
-
-Now let me begin searching for the correct combination!
-
-
-Let us verify this answer:
-
-1. Step 1: 67 * 31 = 2077
-2. Step 2: 2077 - 71 = 2006
-
-Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
-
-
-Incorrect
-
-
-But wait, let me think about it again.
-
-Rewritten Question:
-
-What is the result when you add two numbers from the list [67, 71, 31] together?
-
-Step 1: Add 67 and 31.
-Step 2: Take the sum from Step 1 and add 71.
-
-Final Equation: (67 + 31) + 71
-
-
-(67 + 31) + 71
-
-
-
-Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
-
-
-Correct
-
-
-
-
-
-Hence, the correct answer is:
-
-
-(67 + 31) + 71
-<|endoftext|>
-
-[INFO|configuration_utils.py:765] 2025-10-22 16:08:36,172 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:08:36,173 >> Model config Qwen2Config {
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "dtype": "bfloat16",
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 896,
-  "initializer_range": 0.02,
-  "intermediate_size": 4864,
-  "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "max_window_layers": 24,
-  "model_type": "qwen2",
-  "num_attention_heads": 14,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "transformers_version": "4.57.1",
-  "use_cache": true,
-  "use_mrope": false,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
-
-[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
-[WARNING|logging.py:328] 2025-10-22 16:08:36,500 >> `torch_dtype` is deprecated! Use `dtype` instead!
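The config dump above is enough to hand-count the model's parameters. The sketch below assumes standard Qwen2 details that the dump does not show (bias terms on the Q/K/V projections only, and an LM head tied to the embedding matrix per `tie_word_embeddings`):

```python
# Rough parameter tally for the Qwen2Config printed above.
h, inter, layers, vocab = 896, 4864, 24, 151936
heads, kv_heads = 14, 2
head_dim = h // heads  # 64

attn = h * h + 2 * h * (kv_heads * head_dim) + h * h  # q, k, v, o weights
attn_bias = h + 2 * (kv_heads * head_dim)             # q, k, v biases (Qwen2 convention)
mlp = 3 * h * inter                                    # gate_proj, up_proj, down_proj
norms = 2 * h                                          # two RMSNorms per layer

per_layer = attn + attn_bias + mlp + norms
total = vocab * h + layers * per_layer + h             # tied embeddings + blocks + final norm
print(f"{total:,}")  # prints 494,032,768 -- the "0.5B" base model
```

This base count plus the LoRA adapter parameters reported later in the log accounts for the "all params" figure the trainer prints.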
-[INFO|modeling_utils.py:1172] 2025-10-22 16:08:36,502 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
-[INFO|modeling_utils.py:2341] 2025-10-22 16:08:36,502 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
-[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,503 >> Generate config GenerationConfig {
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "use_cache": false
-}
-
-[INFO|configuration_utils.py:941] 2025-10-22 16:08:36,753 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
-[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,754 >> Generate config GenerationConfig {
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "max_new_tokens": 2048
-}
-
-[INFO|dynamic_module_utils.py:423] 2025-10-22 16:08:36,785 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
-[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
-[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
-[INFO|2025-10-22 16:08:36] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
-[INFO|2025-10-22 16:08:36] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
-[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.misc:143 >> Found linear modules: o_proj,down_proj,v_proj,up_proj,gate_proj,q_proj,k_proj
-[INFO|2025-10-22 16:08:37] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
-[WARNING|trainer.py:906] 2025-10-22 16:08:37,029 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
-[INFO|trainer.py:699] 2025-10-22 16:08:37,031 >> max_steps is given, it will override any value given in num_train_epochs
-[INFO|trainer.py:749] 2025-10-22 16:08:37,031 >> Using auto half precision backend
-[WARNING|trainer.py:982] 2025-10-22 16:08:37,032 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
-[INFO|trainer.py:2519] 2025-10-22 16:08:37,580 >> ***** Running training *****
-[INFO|trainer.py:2520] 2025-10-22 16:08:37,580 >> Num examples = 48,600
-[INFO|trainer.py:2521] 2025-10-22 16:08:37,580 >> Num Epochs = 1
-[INFO|trainer.py:2522] 2025-10-22 16:08:37,581 >> Instantaneous batch size per device = 1
-[INFO|trainer.py:2525] 2025-10-22 16:08:37,581 >> Total train batch size (w. parallel, distributed & accumulation) = 4
-[INFO|trainer.py:2526] 2025-10-22 16:08:37,581 >> Gradient Accumulation steps = 1
-[INFO|trainer.py:2527] 2025-10-22 16:08:37,581 >> Total optimization steps = 100
-[INFO|trainer.py:2528] 2025-10-22 16:08:37,582 >> Number of trainable parameters = 4,399,104
-[INFO|integration_utils.py:867] 2025-10-22 16:08:37,603 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
-wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
-wandb: Tracking run with wandb version 0.22.2
-wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_160837-4de4rspj
-wandb: Run `wandb offline` to turn off syncing.
-wandb: Syncing run interactive_test
-wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
-wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/4de4rspj
- 0%| | 0/100 [00:00<?, ?it/s]
-
-Training completed.
Do not forget to share your model on huggingface.co/models =)
-
-
-gl065:3757610:3757610 [1] NCCL INFO comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
-gl065:3757609:3757609 [0] NCCL INFO comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
- 2025-10-22 16:08:50,960 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
-[INFO|configuration_utils.py:765] 2025-10-22 16:08:51,135 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:08:51,137 >> Model config Qwen2Config {
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "dtype": "bfloat16",
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 896,
-  "initializer_range": 0.02,
-  "intermediate_size": 4864,
-  "layer_types": [
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "max_window_layers": 24,
-  "model_type": "qwen2",
-  "num_attention_heads": 14,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "transformers_version": "4.57.1",
-  "use_cache": true,
-  "use_mrope": false,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
-
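The trainable-parameter line earlier in the log (trainable params: 4,399,104 of 498,431,872, trainable%: 0.8826) is consistent with rank-8 LoRA adapters on the seven linear modules the loader lists. The rank is not printed in this log, so r=8 below is an inference that happens to reproduce the count exactly:

```python
# LoRA adds two low-rank factors per targeted linear layer:
# A (r x in_features) and B (out_features x r).
h, inter, layers = 896, 4864, 24
kv_dim = 128          # num_key_value_heads (2) * head_dim (64)
r = 8                 # assumed rank; not shown in the log

targets = {           # (in_features, out_features) per targeted module
    "q_proj": (h, h), "k_proj": (h, kv_dim), "v_proj": (h, kv_dim),
    "o_proj": (h, h), "gate_proj": (h, inter),
    "up_proj": (h, inter), "down_proj": (inter, h),
}
trainable = layers * sum(r * (i + o) for i, o in targets.values())
print(f"{trainable:,}")                        # prints 4,399,104
print(f"{100 * trainable / 498_431_872:.4f}")  # prints 0.8826 (trainable%)
```

The total train batch size of 4 follows the same bookkeeping: 1 sample per device x 4 ranks (2 nodes x 2 GPUs) x 1 gradient-accumulation step.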
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:08:51,309 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:08:51,329 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:08:51,333 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
- 60%| | 60/100 [00:14<00:07, 5.53it/s] {'loss': 0.6288, 'grad_norm': 0.491170197725296, 'learning_rate': 2.05e-05, 'epoch': 0.0}
- 70%| | 70/100 [00:17<00:07, 4.09it/s] {'loss': 0.6135, 'grad_norm': 0.5212703943252563, 'learning_rate': 1.55e-05, 'epoch': 0.01}
- 80%| | 80/100 [00:19<00:04, 4.26it/s] {'loss': 0.6435, 'grad_norm': 0.4012870192527771, 'learning_rate': 1.05e-05, 'epoch': 0.01}
- 90%| | 90/100 [00:21<00:01, 5.20it/s] {'loss': 0.6313, 'grad_norm': 0.544455885887146, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.01}
-100%|| 100/100 [00:23<00:00, 4.30it/s] {'loss': 0.6241, 'grad_norm': 0.4397493600845337, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.01}
-[INFO|trainer.py:4309] 2025-10-22 16:09:02,713 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
-[INFO|configuration_utils.py:765] 2025-10-22 16:09:02,882 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:09:03,031 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:09:03,066 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:09:03,087 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
-[INFO|trainer.py:2810] 2025-10-22 16:09:03,623 >>
-
-Training completed. Do not forget to share your model on huggingface.co/models =)
-
- {'train_runtime': 26.0427, 'train_samples_per_second': 15.359, 'train_steps_per_second': 3.84, 'train_loss': 0.6805102682113647, 'epoch': 0.01}
-100%|| 100/100 [00:24<00:00, 4.04it/s]
-[INFO|trainer.py:4309] 2025-10-22 16:09:03,635 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-[INFO|configuration_utils.py:765] 2025-10-22 16:09:03,721 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:09:03,869 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:09:03,874 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:09:03,879 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
-***** train metrics *****
-  epoch                    = 0.0082
-  total_flos               = 1473847GF
-  train_loss               = 0.6805
-  train_runtime            = 0:00:26.04
-  train_samples_per_second = 15.359
-  train_steps_per_second   = 3.84
-[INFO|modelcard.py:456] 2025-10-22 16:09:04,104 >> Dropping the following result as it does not have all the necessary fields:
-{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
-gl064:2371876:2371876 [1] NCCL INFO comm 0x14ae6de0 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
-gl064:2371875:2371875 [0] NCCL INFO comm 0x132ed7c0 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
-wandb:
-wandb: View run interactive_test at:
-wandb: Find logs at: wandb/run-20251022_160837-4de4rspj/logs
-
-========================================
-Training completed successfully
-End Time: Wed Oct 22 04:09:06 PM EDT 2025
-========================================
-
-========================================
-STAGE 2: Merging/Exporting Model
-Start Time: Wed Oct 22 04:09:06 PM EDT 2025
-========================================
-Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-Only considering checkpoints created after training started at: Wed Oct 22 04:08:17 PM EDT 2025
-WARNING: No checkpoints found from current training run
-Falling back to most recent checkpoint overall
-Using fallback checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-Checkpoint details:
-  Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-  Last modified: 2025-10-22 16:02:30.204175325 -0400
-  Training step: 150
-Updating merge config to point to checkpoint...
-Successfully updated merge config
-Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-
-Merge config contents:
-  model_name_or_path: Qwen/Qwen2.5-0.5B
-  finetuning_type: lora
-  trust_remote_code: true
-  adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-  template: default
-  export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-
-Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
-  import pkg_resources
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,884 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,884 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,884 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,884 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,885 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,885 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:09:13,885 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:09:14,059 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:09:14,249 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:765] 2025-10-22 16:09:14,529 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[WARNING|logging.py:328] 2025-10-22 16:09:14,530 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|2025-10-22 16:09:14] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
-[WARNING|logging.py:328] 2025-10-22 16:09:14,861 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|modeling_utils.py:1172] 2025-10-22 16:09:14,861 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
-[INFO|modeling_utils.py:2341] 2025-10-22 16:09:14,862 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
-[INFO|configuration_utils.py:986] 2025-10-22 16:09:14,863 >> Generate config GenerationConfig {
-  "bos_token_id": 151643,
-  "eos_token_id": 151643
-}
+========================================
+Multi-Node Coordination
+========================================
+This is the master node - coordinating worker nodes...
+Master node: gl064
+Master port: 29500
+World size: 2
-[INFO|configuration_utils.py:941] 2025-10-22 16:09:14,967 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
-[INFO|configuration_utils.py:986] 2025-10-22 16:09:14,968 >> Generate config GenerationConfig {
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "max_new_tokens": 2048
-}
+Launching on worker node 1: gl065
+All worker nodes launched successfully
+Master node (this node) will now join training as rank 0
-[INFO|dynamic_module_utils.py:423] 2025-10-22 16:09:14,997 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
-[INFO|2025-10-22 16:09:14] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
-[INFO|2025-10-22 16:09:16] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
-[INFO|2025-10-22 16:09:16] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-150
-[INFO|2025-10-22 16:09:16] llamafactory.model.loader:143 >> all params: 494,032,768
-[INFO|2025-10-22 16:09:16] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
-[INFO|configuration_utils.py:491] 2025-10-22 16:09:16,040 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
-[INFO|configuration_utils.py:757] 2025-10-22 16:09:16,045 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
-[INFO|modeling_utils.py:4181] 2025-10-22 16:09:17,844 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:09:17,850 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:09:17,855 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:09:17,860 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
-[INFO|2025-10-22 16:09:18] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
 ========================================
-Merge/Export completed successfully
-End Time: Wed Oct 22 04:09:18 PM EDT 2025
+STAGE 1: Training Model
+Start Time: Wed Oct 22 04:43:08 PM EDT 2025
 ========================================
+Multi-node training detected
+Nodes: 2, GPUs per node: 2
+Master address: gl064
+Master port: 29500
+Node rank: 0
+World size: 2
+CUDA_VISIBLE_DEVICES: 0,1
+LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+
+Starting distributed training with torch.distributed.run...
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
 ========================================
-Preparing Training Artifacts
+Job Name: lf_torch_test__interactive
+Hostname: gl065.hpc.nyu.edu
+Number of nodes: 2
+GPUs per node: 2
+Start Time: Wed Oct 22 04:43:13 PM EDT 2025
+Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
 ========================================
-Copying configuration files...
-Copying and cleaning training logs...
-Training artifacts prepared in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/training_artifacts
-Contents:
-Log files:
+Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
 ========================================
-STAGE 3: Uploading to HuggingFace Hub
-Repository: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
-Start Time: Wed Oct 22 04:09:18 PM EDT 2025
+Configuration Paths
 ========================================
-Uploading contents of: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
-Directory structure:
-
-Executing: huggingface-cli upload TAUR-dev/testing_llamafactory_helper_quick_test__interactive /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged .
-Start hashing 17 files.
-Finished hashing 17 files.
- Warning: 'huggingface-cli upload' is deprecated. Use 'hf upload' instead.
+Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+
+========================================
+STAGE 1: Training Model
+Start Time: Wed Oct 22 04:43:15 PM EDT 2025
+========================================
+Multi-node training detected
+Nodes: 2, GPUs per node: 2
+Master address: gl064
+Master port: 29500
+Node rank: 1
+World size: 2
+CUDA_VISIBLE_DEVICES: 0,1
+LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+
+Starting distributed training with torch.distributed.run...
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+========================================
+Job Name: lf_torch_test__interactive
+Hostname: gl065.hpc.nyu.edu
+Number of nodes: 2
+GPUs per node: 2
+Start Time: Wed Oct 22 04:45:12 PM EDT 2025
+Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
+========================================
+Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
+Terminated
+  File "<frozen runpy>", line 88, in _run_code
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in <module>
+    main()
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+    return f(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main
+    run(args)
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run
+    elastic_launch(
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 284, in launch_agent
+    result = agent.run()
+             ^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper
+    result = f(*args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 717, in run
+    result = self._invoke_run(role)
+             ^^^^^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 875, in _invoke_run
+    self._initialize_workers(self._worker_group)
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper
+    result = f(*args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 685, in _initialize_workers
+    self._rendezvous(worker_group)
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper
+    result = f(*args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous
+    rdzv_info = spec.rdzv_handler.next_rendezvous()
+                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous
+    self._store = TCPStore(  # type: ignore[call-arg]
+                  ^^^^^^^^^
+torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. port: 29500, useIpv6: false, code: -98, name: EADDRINUSE, message: address already in use
+
+========================================
+ERROR: Training failed with exit code 1
+========================================
+========================================
+Cleaning up LlamaFactory processes
+========================================
+Cleaned up processes on gl064.hpc.nyu.edu
+Cleaning up processes on worker node: gl065
+Terminated
+Process cleanup complete
+========================================
+Configuration Paths
+========================================
+Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+
+========================================
+STAGE 1: Training Model
+Start Time: Wed Oct 22 04:45:14 PM EDT 2025
+========================================
+Multi-node training detected
+Nodes: 2, GPUs per node: 2
+Master address: gl064
+Master port: 29500
+Node rank: 1
+World size: 2
+CUDA_VISIBLE_DEVICES: 0,1
+LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+
+Starting distributed training with torch.distributed.run...
+/scratch/zrs2020/miniconda/miniconda3/bin/python: Error while finding module specification for 'torch.distributed.run' (ModuleNotFoundError: No module named 'torch') - ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A +======================================== +ERROR: Training failed with exit code 1 +======================================== +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl065.hpc.nyu.edu +Process cleanup complete +======================================== +Job Name: lf_torch_test__interactive +Hostname: gl064.hpc.nyu.edu +Number of nodes: 2 +GPUs per node: 2 +Start Time: Wed Oct 22 04:45:32 PM EDT 2025 +Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log +======================================== +Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env - .../merged/model.safetensors: 86%| | 847MB / 988MB [A[A[AProcessing Files (1 / 2) : 86%| | 859MB / 1.00GB, 615MB/s +======================================== +Configuration Paths +======================================== +Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml +Dataset Info: +Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints +Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive - ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A +======================================== +Multi-Node Coordination +======================================== +This is the master node - coordinating worker nodes... 
+Master node: gl064 +Master port: 29500 +World size: 2 +Launching on worker node 1: gl065 +All worker nodes launched successfully +Master node (this node) will now join training as rank 0 +======================================== +STAGE 1: Training Model +Start Time: Wed Oct 22 04:45:34 PM EDT 2025 +======================================== +Multi-node training detected +Nodes: 2, GPUs per node: 2 +Master address: gl064 +Master port: 29500 +Node rank: 0 +World size: 2 +CUDA_VISIBLE_DEVICES: 0,1 +LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory +Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Starting distributed training with torch.distributed.run... +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+***************************************** +Traceback (most recent call last): + File "<frozen runpy>", line 198, in _run_module_as_main + File "<frozen runpy>", line 88, in _run_code + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in <module> + main() + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main + run(args) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run + elastic_launch( + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 284, in launch_agent + result = agent.run() + ^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 717, in run + result = self._invoke_run(role) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 875, in _invoke_run + self._initialize_workers(self._worker_group) + File
"/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 685, in _initialize_workers + self._rendezvous(worker_group) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous + rdzv_info = spec.rdzv_handler.next_rendezvous() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. 
port: 29500, useIpv6: false, code: -98, name: EADDRINUSE, message: address already in use + +======================================== +ERROR: Training failed with exit code 1 +======================================== +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl064.hpc.nyu.edu +Cleaning up processes on worker node: gl065 +Terminated +Terminated +Process cleanup complete +======================================== +Configuration Paths +======================================== +Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml +Dataset Info: +Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints +Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive +======================================== +STAGE 1: Training Model +Start Time: Wed Oct 22 04:45:37 PM EDT 2025 +======================================== +Multi-node training detected +Nodes: 2, GPUs per node: 2 +Master address: gl064 +Master port: 29500 +Node rank: 1 +World size: 2 +CUDA_VISIBLE_DEVICES: 0,1 +LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory +Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Starting distributed training with torch.distributed.run...
+/scratch/zrs2020/miniconda/miniconda3/bin/python: Error while finding module specification for 'torch.distributed.run' (ModuleNotFoundError: No module named 'torch') +======================================== +ERROR: Training failed with exit code 1 +======================================== +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl065.hpc.nyu.edu +Process cleanup complete +======================================== +Job Name: lf_torch_test__interactive +Hostname: gl065.hpc.nyu.edu +Number of nodes: 2 +GPUs per node: 2 +Start Time: Wed Oct 22 04:48:00 PM EDT 2025 +Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log +======================================== +Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env +Terminated + File "<frozen runpy>", line 88, in _run_code + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 940, in <module> + main() + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper + return f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 936, in main + run(args) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run + elastic_launch( + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File
"/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 284, in launch_agent + result = agent.run() + ^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 717, in run + result = self._invoke_run(role) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 875, in _invoke_run + self._initialize_workers(self._worker_group) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 685, in _initialize_workers + self._rendezvous(worker_group) + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous + rdzv_info = spec.rdzv_handler.next_rendezvous() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. port: 29500, useIpv6: false, code: -98, name: EADDRINUSE, message: address already in use + +======================================== +ERROR: Training failed with exit code 1 +======================================== -Removing 14 file(s) from commit that have not changed. -https://huggingface.co/TAUR-dev/testing_llamafactory_helper_quick_test__interactive/tree/main/. +======================================== +Cleaning up LlamaFactory processes +======================================== +Cleaned up processes on gl064.hpc.nyu.edu +Cleaning up processes on worker node: gl065 +Terminated +Process cleanup complete ======================================== -Upload completed successfully -Model and training artifacts uploaded to: TAUR-dev/testing_llamafactory_helper_quick_test__interactive -End Time: Wed Oct 22 04:09:24 PM EDT 2025 +Configuration Paths ======================================== +Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml +Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml +Dataset Info: +Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints +Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive + ======================================== -STAGE 4: Cleanup +STAGE 1: Training Model +Start Time: Wed Oct 22 04:48:02 PM EDT 2025 ======================================== -Keeping checkpoints in:
/scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints -Keeping merged model in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged +Multi-node training detected +Nodes: 2, GPUs per node: 2 +Master address: gl064 +Master port: 29500 +Node rank: 1 +World size: 2 +CUDA_VISIBLE_DEVICES: 0,1 +LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory +Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml + +Starting distributed training with torch.distributed.run... +/scratch/zrs2020/miniconda/miniconda3/bin/python: Error while finding module specification for 'torch.distributed.run' (ModuleNotFoundError: No module named 'torch') ======================================== -PIPELINE COMPLETED SUCCESSFULLY -End Time: Wed Oct 22 04:09:24 PM EDT 2025 +ERROR: Training failed with exit code 1 ======================================== ======================================== Cleaning up LlamaFactory processes ======================================== -Cleaned up processes on gl064.hpc.nyu.edu -Cleaning up processes on worker node: gl065 +Cleaned up processes on gl065.hpc.nyu.edu Process cleanup complete ======================================== Job Name: lf_torch_test__interactive Hostname: gl064.hpc.nyu.edu Number of nodes: 2 GPUs per node: 2 -Start Time: Wed Oct 22 04:11:30 PM EDT 2025 +Start Time: Wed Oct 22 04:53:28 PM EDT 2025 Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log ======================================== Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env @@ -3565,7 +1473,7 @@ Master node (this node) will now join training as rank 0 ======================================== STAGE 1: Training Model -Start Time: Wed Oct 22 04:11:33 PM EDT 2025 +Start Time: Wed Oct 22 04:53:31 PM EDT 2025 ======================================== Multi-node training detected 
Nodes: 2, GPUs per node: 2 @@ -3590,19 +1498,19 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default import pkg_resources /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources -[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled. -[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 -[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 
2025-10-22 16:11:51,908 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:11:52,077 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. -[INFO|configuration_utils.py:765] 2025-10-22 16:11:52,302 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json -[INFO|configuration_utils.py:839] 2025-10-22 16:11:52,304 >> Model config Qwen2Config { +[INFO|2025-10-22 16:53:48] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled. 
+[INFO|2025-10-22 16:53:48] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16 +[INFO|2025-10-22 16:53:48] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16 +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,453 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:53:48,624 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
+[INFO|configuration_utils.py:765] 2025-10-22 16:53:48,834 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json +[INFO|configuration_utils.py:839] 2025-10-22 16:53:48,837 >> Model config Qwen2Config { "architectures": [ "Qwen2ForCausalLM" ], @@ -3658,88 +1566,88 @@ Setting OMP_NUM_THREADS environment variable for each process to be 1 in default "vocab_size": 151936 } -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file added_tokens.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file special_tokens_map.json from cache at None -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json -[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,370 >> loading file chat_template.jinja from cache at None -[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:11:52,534 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are 
fine-tuned or trained. -[INFO|2025-10-22 16:11:52] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file added_tokens.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file special_tokens_map.json from cache at None +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json +[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:53:48,899 >> loading file chat_template.jinja from cache at None +[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:53:49,064 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[INFO|2025-10-22 16:53:49] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data... /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4876: UserWarning: barrier(): using the device under current context. 
You can specify `device_id` in `init_process_group` to mute this warning. warnings.warn( # warn only once -[rank0]:[W1022 16:11:53.769183387 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group() -gl064:2373549:2373549 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl064:2373549:2373549 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0> -gl064:2373549:2373549 [0] NCCL INFO cudaDriverVersion 13000 -gl064:2373549:2373549 [0] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl064:2373549:2373549 [0] NCCL INFO Comm config Blocking set to 1 -gl064:2373550:2373550 [1] NCCL INFO cudaDriverVersion 13000 -gl064:2373550:2373550 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl064:2373550:2373550 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0> -gl064:2373550:2373550 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 -gl064:2373550:2373550 [1] NCCL INFO Comm config Blocking set to 1 -gl064:2373549:2373629 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl064:2373549:2373629 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. -gl064:2373549:2373629 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl064:2373549:2373629 [0] NCCL INFO NCCL_IB_HCA set to mlx5 -gl064:2373550:2373630 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. -gl064:2373550:2373630 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. 
-gl064:2373550:2373630 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs -gl064:2373550:2373630 [1] NCCL INFO NCCL_IB_HCA set to mlx5 -gl064:2373549:2373629 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0> -gl064:2373549:2373629 [0] NCCL INFO Initialized NET plugin IB -gl064:2373549:2373629 [0] NCCL INFO Assigned NET plugin IB to comm -gl064:2373549:2373629 [0] NCCL INFO Using network IB -gl064:2373549:2373629 [0] NCCL INFO ncclCommInitRankConfig comm 0x156177e0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x94e527c6b9214f3c - Init START -gl064:2373550:2373630 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0> -gl064:2373550:2373630 [1] NCCL INFO Initialized NET plugin IB -gl064:2373550:2373630 [1] NCCL INFO Assigned NET plugin IB to comm -gl064:2373550:2373630 [1] NCCL INFO Using network IB -gl064:2373550:2373630 [1] NCCL INFO ncclCommInitRankConfig comm 0x123b0010 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x94e527c6b9214f3c - Init START -gl064:2373550:2373630 [1] NCCL INFO RAS client listening socket at ::1<28028> -gl064:2373549:2373629 [0] NCCL INFO RAS client listening socket at ::1<28028> -gl064:2373550:2373630 [1] NCCL INFO Bootstrap timings total 0.008614 (create 0.000021, send 0.000072, recv 0.006619, ring 0.001056, delay 0.000000) -gl064:2373549:2373629 [0] NCCL INFO Bootstrap timings total 0.019798 (create 0.000024, send 0.000210, recv 0.001659, ring 0.000908, delay 0.000000) -gl064:2373550:2373630 [1] NCCL INFO Setting affinity for GPU 1 to 0-15 -gl064:2373549:2373629 [0] NCCL INFO Setting affinity for GPU 0 to 0-15 -gl064:2373549:2373629 [0] NCCL INFO comm 0x156177e0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0 -gl064:2373550:2373630 [1] NCCL INFO comm 0x123b0010 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0 -gl064:2373549:2373629 [0] NCCL INFO Channel 00/02 : 0 1 2 3 -gl064:2373549:2373629 [0] NCCL INFO Channel 01/02 : 0 1 2 3 -gl064:2373550:2373630 [1] 
NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 -gl064:2373549:2373629 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2 -gl064:2373549:2373629 [0] NCCL INFO P2P Chunksize set to 131072 -gl064:2373550:2373630 [1] NCCL INFO P2P Chunksize set to 131072 -gl064:2373550:2373630 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl064:2373549:2373629 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. -gl064:2373549:2373629 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 -gl064:2373550:2373635 [1] NCCL INFO [Proxy Service] Device 1 CPU core 10 -gl064:2373549:2373636 [0] NCCL INFO [Proxy Service] Device 0 CPU core 11 -gl064:2373550:2373637 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 12 -gl064:2373549:2373638 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 13 -gl064:2373550:2373630 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl064:2373550:2373630 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl064:2373549:2373629 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 -gl064:2373549:2373629 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer -gl064:2373549:2373629 [0] NCCL INFO CC Off, workFifoBytes 1048576 -gl064:2373550:2373630 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. -gl064:2373550:2373630 [1] NCCL INFO ncclCommInitRankConfig comm 0x123b0010 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x94e527c6b9214f3c - Init COMPLETE -gl064:2373550:2373630 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.01, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00) -gl064:2373549:2373629 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
-gl064:2373549:2373629 [0] NCCL INFO ncclCommInitRankConfig comm 0x156177e0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x94e527c6b9214f3c - Init COMPLETE -gl064:2373549:2373629 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.14 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00) -gl064:2373549:2373640 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0 -gl064:2373549:2373641 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 0 -gl064:2373549:2373640 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0 -gl064:2373549:2373640 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct -gl064:2373549:2373640 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct -gl064:2373550:2373639 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0 -gl064:2373550:2373639 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0 -gl064:2373550:2373642 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 7 -gl064:2373549:2373640 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 -gl064:2373550:2373639 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 +[rank0]:[W1022 16:53:49.255685164 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. 
You can specify device_id in init_process_group() +gl064:2389573:2389573 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl064:2389573:2389573 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0> +gl064:2389573:2389573 [0] NCCL INFO cudaDriverVersion 13000 +gl064:2389573:2389573 [0] NCCL INFO NCCL version 2.27.5+cuda12.9 +gl064:2389573:2389573 [0] NCCL INFO Comm config Blocking set to 1 +gl064:2389574:2389574 [1] NCCL INFO cudaDriverVersion 13000 +gl064:2389574:2389574 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl064:2389574:2389574 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0> +gl064:2389574:2389574 [1] NCCL INFO NCCL version 2.27.5+cuda12.9 +gl064:2389574:2389574 [1] NCCL INFO Comm config Blocking set to 1 +gl064:2389573:2389623 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. +gl064:2389573:2389623 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. +gl064:2389573:2389623 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl064:2389573:2389623 [0] NCCL INFO NCCL_IB_HCA set to mlx5 +gl064:2389574:2389624 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. +gl064:2389574:2389624 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. 
+gl064:2389574:2389624 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs +gl064:2389574:2389624 [1] NCCL INFO NCCL_IB_HCA set to mlx5 +gl064:2389573:2389623 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0> +gl064:2389573:2389623 [0] NCCL INFO Initialized NET plugin IB +gl064:2389573:2389623 [0] NCCL INFO Assigned NET plugin IB to comm +gl064:2389573:2389623 [0] NCCL INFO Using network IB +gl064:2389573:2389623 [0] NCCL INFO ncclCommInitRankConfig comm 0x13ad6a40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x34b3627a2ac82e5c - Init START +gl064:2389574:2389624 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0> +gl064:2389574:2389624 [1] NCCL INFO Initialized NET plugin IB +gl064:2389574:2389624 [1] NCCL INFO Assigned NET plugin IB to comm +gl064:2389574:2389624 [1] NCCL INFO Using network IB +gl064:2389574:2389624 [1] NCCL INFO ncclCommInitRankConfig comm 0x1230b120 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x34b3627a2ac82e5c - Init START +gl064:2389573:2389623 [0] NCCL INFO RAS client listening socket at ::1<28028> +gl064:2389574:2389624 [1] NCCL INFO RAS client listening socket at ::1<28028> +gl064:2389574:2389624 [1] NCCL INFO Bootstrap timings total 0.197747 (create 0.000021, send 0.000081, recv 0.195860, ring 0.001046, delay 0.000000) +gl064:2389573:2389623 [0] NCCL INFO Bootstrap timings total 0.200890 (create 0.000021, send 0.000184, recv 0.003256, ring 0.177280, delay 0.000000) +gl064:2389573:2389623 [0] NCCL INFO Setting affinity for GPU 0 to 0-31 +gl064:2389574:2389624 [1] NCCL INFO Setting affinity for GPU 1 to 0-31 +gl064:2389573:2389623 [0] NCCL INFO comm 0x13ad6a40 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0 +gl064:2389574:2389624 [1] NCCL INFO comm 0x1230b120 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0 +gl064:2389573:2389623 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +gl064:2389574:2389624 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 
+gl064:2389573:2389623 [0] NCCL INFO Channel 01/02 : 0 1 2 3
+gl064:2389574:2389624 [1] NCCL INFO P2P Chunksize set to 131072
+gl064:2389573:2389623 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
+gl064:2389573:2389623 [0] NCCL INFO P2P Chunksize set to 131072
+gl064:2389574:2389624 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl064:2389573:2389623 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl064:2389573:2389623 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+gl064:2389573:2389629 [0] NCCL INFO [Proxy Service] Device 0 CPU core 23
+gl064:2389573:2389631 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 26
+gl064:2389574:2389632 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 6
+gl064:2389574:2389630 [1] NCCL INFO [Proxy Service] Device 1 CPU core 4
+gl064:2389574:2389624 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl064:2389574:2389624 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl064:2389573:2389623 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl064:2389573:2389623 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl064:2389573:2389623 [0] NCCL INFO CC Off, workFifoBytes 1048576
+gl064:2389574:2389624 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl064:2389574:2389624 [1] NCCL INFO ncclCommInitRankConfig comm 0x1230b120 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x34b3627a2ac82e5c - Init COMPLETE
+gl064:2389574:2389624 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.33 (kernels 0.09, alloc 0.01, bootstrap 0.20, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
+gl064:2389573:2389623 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl064:2389573:2389623 [0] NCCL INFO ncclCommInitRankConfig comm 0x13ad6a40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x34b3627a2ac82e5c - Init COMPLETE +gl064:2389573:2389623 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.33 (kernels 0.09, alloc 0.01, bootstrap 0.20, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.00, rest 0.00) +gl064:2389573:2389633 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0 +gl064:2389573:2389633 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0 +gl064:2389573:2389635 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 25 +gl064:2389573:2389633 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct +gl064:2389573:2389633 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct +gl064:2389574:2389634 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0 +gl064:2389574:2389634 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0 +gl064:2389574:2389636 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 8 +gl064:2389573:2389633 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 +gl064:2389574:2389634 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0 training example: input_ids: [33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 
7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 
12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 
10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198] @@ -3991,8 +1899,8 @@ Hence, the correct answer is: (67 + 31) + 71 <|endoftext|> -[INFO|configuration_utils.py:765] 2025-10-22 16:11:53,824 >> loading configuration file config.json from cache at 
/scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:11:53,825 >> Model config Qwen2Config {
+[INFO|configuration_utils.py:765] 2025-10-22 16:53:50,512 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:53:50,512 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
@@ -4048,62 +1956,61 @@ Hence, the correct answer is:
  "vocab_size": 151936
}
-[INFO|2025-10-22 16:11:53] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
-[WARNING|logging.py:328] 2025-10-22 16:11:54,141 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|modeling_utils.py:1172] 2025-10-22 16:11:54,143 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
-[INFO|modeling_utils.py:2341] 2025-10-22 16:11:54,144 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
-[INFO|configuration_utils.py:986] 2025-10-22 16:11:54,145 >> Generate config GenerationConfig {
+[INFO|2025-10-22 16:53:50] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
+[WARNING|logging.py:328] 2025-10-22 16:53:50,871 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 16:53:50,872 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 16:53:50,873 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
+[INFO|configuration_utils.py:986] 2025-10-22 16:53:50,874 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "use_cache": false
}
`torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|configuration_utils.py:941] 2025-10-22 16:11:54,648 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
-[INFO|configuration_utils.py:986] 2025-10-22 16:11:54,648 >> Generate config GenerationConfig {
+[INFO|configuration_utils.py:941] 2025-10-22 16:53:51,142 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 16:53:51,142 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "max_new_tokens": 2048
}
-[INFO|dynamic_module_utils.py:423] 2025-10-22 16:11:54,677 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
-[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
-[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
-[INFO|2025-10-22 16:11:54] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
-[INFO|2025-10-22 16:11:54] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
-[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.misc:143 >> Found linear modules: o_proj,gate_proj,k_proj,v_proj,up_proj,down_proj,q_proj
-[INFO|2025-10-22 16:11:54] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
+[INFO|dynamic_module_utils.py:423] 2025-10-22 16:53:51,176 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|2025-10-22 16:53:51] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
+[INFO|2025-10-22 16:53:51] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-22 16:53:51] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
+[INFO|2025-10-22 16:53:51] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
+[INFO|2025-10-22 16:53:51] llamafactory.model.model_utils.misc:143 >> Found linear modules: v_proj,down_proj,k_proj,up_proj,gate_proj,o_proj,q_proj
+[INFO|2025-10-22 16:53:51] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
+[WARNING|trainer.py:906] 2025-10-22 16:53:51,421 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
+[INFO|trainer.py:699] 2025-10-22 16:53:51,423 >> max_steps is given, it will override any value given in num_train_epochs
+[INFO|trainer.py:749] 2025-10-22 16:53:51,424 >> Using auto half precision backend
+[WARNING|trainer.py:982] 2025-10-22 16:53:51,425 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
-[WARNING|trainer.py:906] 2025-10-22 16:11:54,922 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
-[INFO|trainer.py:699] 2025-10-22 16:11:54,925 >> max_steps is given, it will override any value given in num_train_epochs
-[INFO|trainer.py:749] 2025-10-22 16:11:54,925 >> Using auto half precision backend
-[WARNING|2025-10-22 16:11:54] llamafactory.train.callbacks:154 >> Previous trainer log in this folder will be deleted.
-[WARNING|trainer.py:982] 2025-10-22 16:11:54,927 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
-[INFO|trainer.py:2519] 2025-10-22 16:11:55,091 >> ***** Running training *****
-[INFO|trainer.py:2520] 2025-10-22 16:11:55,091 >> Num examples = 48,600
-[INFO|trainer.py:2521] 2025-10-22 16:11:55,091 >> Num Epochs = 1
-[INFO|trainer.py:2522] 2025-10-22 16:11:55,091 >> Instantaneous batch size per device = 1
-[INFO|trainer.py:2525] 2025-10-22 16:11:55,091 >> Total train batch size (w. parallel, distributed & accumulation) = 4
-[INFO|trainer.py:2526] 2025-10-22 16:11:55,091 >> Gradient Accumulation steps = 1
-[INFO|trainer.py:2527] 2025-10-22 16:11:55,091 >> Total optimization steps = 100
-[INFO|trainer.py:2528] 2025-10-22 16:11:55,092 >> Number of trainable parameters = 4,399,104
-[INFO|integration_utils.py:867] 2025-10-22 16:11:55,115 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
+[INFO|trainer.py:2519] 2025-10-22 16:53:51,656 >> ***** Running training *****
+[INFO|trainer.py:2520] 2025-10-22 16:53:51,656 >> Num examples = 48,600
+[INFO|trainer.py:2521] 2025-10-22 16:53:51,656 >> Num Epochs = 1
+[INFO|trainer.py:2522] 2025-10-22 16:53:51,656 >> Instantaneous batch size per device = 1
+[INFO|trainer.py:2525] 2025-10-22 16:53:51,656 >> Total train batch size (w. parallel, distributed & accumulation) = 4
+[INFO|trainer.py:2526] 2025-10-22 16:53:51,656 >> Gradient Accumulation steps = 1
+[INFO|trainer.py:2527] 2025-10-22 16:53:51,656 >> Total optimization steps = 100
+[INFO|trainer.py:2528] 2025-10-22 16:53:51,658 >> Number of trainable parameters = 4,399,104
+[INFO|integration_utils.py:867] 2025-10-22 16:53:51,679 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.22.2
-wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_161155-mev7yv4q
+wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_165351-ytl2gm77
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run interactive_test
wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
-wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mev7yv4q
- 0%| | 0/100 [00:00> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
-[INFO|configuration_utils.py:765] 2025-10-22 16:12:08,395 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:12:08,396 >> Model config Qwen2Config {
+wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/ytl2gm77
+ 0%| | 0/100 [00:00> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
+[INFO|configuration_utils.py:765] 2025-10-22 16:54:05,048 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:54:05,049 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
@@ -4159,17 +2066,17 @@ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mev7yv4q
  "vocab_size": 151936
}
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:08,565 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:08,585 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:08,589 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
- 51%| | 51/100 [00:13<00:24, 2.02it/s] 52%| | 52/100 [00:13<00:20, 2.31it/s] 53%| | 53/100 [00:13<00:17, 2.67it/s] 54%| | 54/100 [00:13<00:14, 3.24it/s] 55%| | 55/100 [00:13<00:13, 3.34it/s] 56%| | 56/100 [00:14<00:11, 3.95it/s] 57%| | 57/100 [00:14<00:10, 4.02it/s] 58%| | 58/100 [00:14<00:09, 4.44it/s] 59%| | 59/100 [00:14<00:07, 5.13it/s] 60%| | 60/100 [00:14<00:07, 5.54it/s] {'loss': 0.6288, 'grad_norm': 0.4912378787994385, 'learning_rate': 2.05e-05, 'epoch': 0.0}
- 60%| | 60/100 [00:14<00:07, 5.54it/s] 61%| | 61/100 [00:15<00:07, 5.17it/s] 62%| | 62/100 [00:15<00:06, 5.72it/s] 63%| | 63/100 [00:15<00:07, 5.16it/s] 64%| | 64/100 [00:15<00:06, 5.46it/s] 65%| | 65/100 [00:15<00:07, 4.98it/s] 66%| | 66/100 [00:16<00:07, 4.60it/s] 67%| | 67/100 [00:16<00:06, 4.87it/s] 68%| | 68/100 [00:16<00:07, 4.54it/s] 69%| | 69/100 [00:16<00:07, 4.16it/s] 70%| | 70/100 [00:17<00:07, 4.10it/s] {'loss': 0.6135, 'grad_norm': 0.521141767501831, 'learning_rate': 1.55e-05, 'epoch': 0.01}
- 70%| | 70/100 [00:17<00:07, 4.10it/s] 71%| | 71/100 [00:17<00:07, 3.97it/s] 72%| | 72/100 [00:17<00:06, 4.49it/s] 73%| | 73/100 [00:17<00:06, 4.00it/s] 74%| | 74/100 [00:17<00:06, 4.20it/s] 75%| | 75/100 [00:18<00:05, 4.70it/s] 76%| | 76/100 [00:18<00:05, 4.73it/s] 77%| | 77/100 [00:18<00:04, 5.24it/s] 78%| | 78/100 [00:18<00:04, 4.69it/s] 79%| | 79/100 [00:18<00:04, 4.55it/s] 80%| | 80/100 [00:19<00:04, 4.27it/s] {'loss': 0.6435, 'grad_norm': 0.4013785123825073, 'learning_rate': 1.05e-05, 'epoch': 0.01}
- 80%| | 80/100 [00:19<00:04, 4.27it/s] 81%| | 81/100 [00:19<00:04, 4.66it/s] 82%| | 82/100 [00:19<00:04, 4.06it/s] 83%| | 83/100 [00:19<00:03, 4.46it/s] 84%| | 84/100 [00:20<00:03, 4.40it/s] 85%| | 85/100 [00:20<00:03, 4.46it/s] 86%| | 86/100 [00:20<00:02, 5.06it/s] 87%| | 87/100 [00:20<00:02, 5.19it/s] 88%| | 88/100 [00:20<00:02, 4.88it/s] 89%| | 89/100 [00:21<00:02, 4.59it/s] 90%| | 90/100 [00:21<00:01, 5.20it/s] {'loss': 0.6314, 'grad_norm': 0.544479489326477, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.01}
- 90%| | 90/100 [00:21<00:01, 5.20it/s] 91%| | 91/100 [00:21<00:01, 4.64it/s] 92%|| 92/100 [00:21<00:01, 4.52it/s] 93%|| 93/100 [00:21<00:01, 4.74it/s] 94%|| 94/100 [00:22<00:01, 4.69it/s] 95%|| 95/100 [00:22<00:01, 4.78it/s] 96%|| 96/100 [00:22<00:00, 4.42it/s] 97%|| 97/100 [00:23<00:00, 3.84it/s] 98%|| 98/100 [00:23<00:00, 4.26it/s] 99%|| 99/100 [00:23<00:00, 4.53it/s]100%|| 100/100 [00:23<00:00, 4.31it/s] {'loss': 0.6241, 'grad_norm': 0.4398234486579895, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.01}
-100%|| 100/100 [00:23<00:00, 4.31it/s][INFO|trainer.py:4309] 2025-10-22 16:12:19,957 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
-[INFO|configuration_utils.py:765] 2025-10-22 16:12:20,123 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:12:20,124 >> Model config Qwen2Config {
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:54:05,208 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:54:05,213 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:54:05,232 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
+ 51%| | 51/100 [00:13<00:24, 1.98it/s] 52%| | 52/100 [00:13<00:21, 2.27it/s] 53%| | 53/100 [00:13<00:17, 2.64it/s] 54%| | 54/100 [00:14<00:14, 3.21it/s] 55%| | 55/100 [00:14<00:13, 3.31it/s] 56%| | 56/100 [00:14<00:11, 3.93it/s] 57%| | 57/100 [00:14<00:10, 4.01it/s] 58%| | 58/100 [00:14<00:09, 4.43it/s] 59%| | 59/100 [00:14<00:08, 5.12it/s] 60%| | 60/100 [00:15<00:07, 5.52it/s] {'loss': 0.6288, 'grad_norm': 0.49134066700935364, 'learning_rate': 2.05e-05, 'epoch': 0.0}
+ 60%| | 60/100 [00:15<00:07, 5.52it/s] 61%| | 61/100 [00:15<00:07, 5.16it/s] 62%| | 62/100 [00:15<00:06, 5.72it/s] 63%| | 63/100 [00:15<00:07, 5.16it/s] 64%| | 64/100 [00:15<00:06, 5.46it/s] 65%| | 65/100 [00:16<00:07, 4.98it/s] 66%| | 66/100 [00:16<00:07, 4.60it/s] 67%| | 67/100 [00:16<00:06, 4.87it/s] 68%| | 68/100 [00:16<00:07, 4.55it/s] 69%| | 69/100 [00:17<00:07, 4.16it/s] 70%| | 70/100 [00:17<00:07, 4.11it/s] {'loss': 0.6135, 'grad_norm': 0.5212565660476685, 'learning_rate': 1.55e-05, 'epoch': 0.01}
+ 70%| | 70/100 [00:17<00:07, 4.11it/s] 71%| | 71/100 [00:17<00:07, 3.99it/s] 72%| | 72/100 [00:17<00:06, 4.50it/s] 73%| | 73/100 [00:18<00:06, 4.01it/s] 74%| | 74/100 [00:18<00:06, 4.20it/s] 75%| | 75/100 [00:18<00:05, 4.71it/s] 76%| | 76/100 [00:18<00:05, 4.74it/s] 77%| | 77/100 [00:18<00:04, 5.25it/s] 78%| | 78/100 [00:19<00:04, 4.70it/s] 79%| | 79/100 [00:19<00:04, 4.56it/s] 80%| | 80/100 [00:19<00:04, 4.27it/s] {'loss': 0.6435, 'grad_norm': 0.40127691626548767, 'learning_rate': 1.05e-05, 'epoch': 0.01}
+ 80%| | 80/100 [00:19<00:04, 4.27it/s] 81%| | 81/100 [00:19<00:04, 4.66it/s] 82%| | 82/100 [00:20<00:04, 4.07it/s] 83%| | 83/100 [00:20<00:03, 4.46it/s] 84%| | 84/100 [00:20<00:03, 4.41it/s] 85%| | 85/100 [00:20<00:03, 4.47it/s] 86%| | 86/100 [00:20<00:02, 5.07it/s] 87%| | 87/100 [00:20<00:02, 5.19it/s] 88%| | 88/100 [00:21<00:02, 4.89it/s] 89%| | 89/100 [00:21<00:02, 4.60it/s] 90%| | 90/100 [00:21<00:01, 5.22it/s] {'loss': 0.6313, 'grad_norm': 0.5443973541259766, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.01}
+ 90%| | 90/100 [00:21<00:01, 5.22it/s] 91%| | 91/100 [00:21<00:01, 4.65it/s] 92%|| 92/100 [00:22<00:02, 3.71it/s] 93%|| 93/100 [00:22<00:01, 4.09it/s] 94%|| 94/100 [00:22<00:01, 4.23it/s] 95%|| 95/100 [00:22<00:01, 4.43it/s] 96%|| 96/100 [00:23<00:00, 4.21it/s] 97%|| 97/100 [00:23<00:00, 3.73it/s] 98%|| 98/100 [00:23<00:00, 4.17it/s] 99%|| 99/100 [00:23<00:00, 4.46it/s]100%|| 100/100 [00:24<00:00, 4.27it/s] {'loss': 0.6241, 'grad_norm': 0.44017264246940613, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.01}
+100%|| 100/100 [00:24<00:00, 4.27it/s][INFO|trainer.py:4309] 2025-10-22 16:54:16,736 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+[INFO|configuration_utils.py:765] 2025-10-22 16:54:16,832 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:54:16,833 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
@@ -4225,19 +2132,19 @@ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mev7yv4q
  "vocab_size": 151936
}
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:20,301 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:20,337 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:20,361 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
-[INFO|trainer.py:2810] 2025-10-22 16:12:20,905 >>
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:54:16,956 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:54:16,960 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:54:16,964 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
+[INFO|trainer.py:2810] 2025-10-22 16:54:17,417 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
- {'train_runtime': 25.8131, 'train_samples_per_second': 15.496, 'train_steps_per_second': 3.874, 'train_loss': 0.6805157041549683, 'epoch': 0.01}
-100%|| 100/100 [00:24<00:00, 4.31it/s]100%|| 100/100 [00:24<00:00, 4.07it/s]
-[INFO|trainer.py:4309] 2025-10-22 16:12:20,914 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
-[INFO|configuration_utils.py:765] 2025-10-22 16:12:20,991 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:12:20,992 >> Model config Qwen2Config {
+ {'train_runtime': 25.7603, 'train_samples_per_second': 15.528, 'train_steps_per_second': 3.882, 'train_loss': 0.6805182123184204, 'epoch': 0.01}
+100%|| 100/100 [00:24<00:00, 4.27it/s]100%|| 100/100 [00:24<00:00, 4.03it/s]
+[INFO|trainer.py:4309] 2025-10-22 16:54:17,427 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+[INFO|configuration_utils.py:765] 2025-10-22 16:54:17,506 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:54:17,507 >> Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
@@ -4293,44 +2200,43 @@ Training completed. Do not forget to share your model on huggingface.co/models =
  "vocab_size": 151936
}
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:21,136 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:21,141 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:21,146 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:54:17,621 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:54:17,625 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:54:17,629 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
***** train metrics *****
epoch = 0.0082
total_flos = 1473847GF
train_loss = 0.6805
- train_runtime = 0:00:25.81
- train_samples_per_second = 15.496
- train_steps_per_second = 3.874
-[INFO|modelcard.py:456] 2025-10-22 16:12:21,372 >> Dropping the following result as it does not have all the necessary fields:
+ train_runtime = 0:00:25.76
+ train_samples_per_second = 15.528
+ train_steps_per_second = 3.882
+[INFO|modelcard.py:456] 2025-10-22 16:54:17,771 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
-gl064:2373550:2373550 [1] NCCL INFO comm 0x123b0010 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
-gl064:2373549:2373549 [0] NCCL INFO comm 0x156177e0 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
+gl064:2389574:2389574 [1] NCCL INFO comm 0x1230b120 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
+gl064:2389573:2389573 [0] NCCL INFO comm 0x13ad6a40 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
wandb:
wandb: View run interactive_test at:
-wandb: Find logs at: wandb/run-20251022_161155-mev7yv4q/logs
+wandb: Find logs at: wandb/run-20251022_165351-ytl2gm77/logs
========================================
Training completed successfully
-End Time: Wed Oct 22 04:12:23 PM EDT 2025
+End Time: Wed Oct 22 04:54:19 PM EDT 2025
========================================
========================================
STAGE 2: Merging/Exporting Model
-Start Time: Wed Oct 22 04:12:23 PM EDT 2025
+Start Time: Wed Oct 22 04:54:19 PM EDT 2025
========================================
Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
Analyzing checkpoints to find the one from current training run...
- - checkpoint-100: trainer_state.json modified at Wed Oct 22 04:12:20 PM EDT 2025
- - checkpoint-150: trainer_state.json modified at Wed Oct 22 04:02:30 PM EDT 2025
- - checkpoint-50: trainer_state.json modified at Wed Oct 22 04:12:09 PM EDT 2025
+ - checkpoint-100: trainer_state.json modified at Wed Oct 22 04:54:17 PM EDT 2025
+ - checkpoint-50: trainer_state.json modified at Wed Oct 22 04:54:05 PM EDT 2025
 Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
 This checkpoint has the most recently updated trainer_state.json
 Checkpoint details:
 Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
- Last modified: 2025-10-22 16:02:17.627741631 -0400
+ Last modified: 2025-10-22 16:54:17.414188691 -0400
 Training step: 100
 Updating merge config to point to checkpoint...
 Successfully updated merge config
@@ -4349,16 +2255,16 @@ Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/e
 warnings.warn(
 /scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
 import pkg_resources
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:12:32,816 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:12:33,035 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:12:33,036 >> Model config Qwen2Config {
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:27,566 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:54:27,740 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:54:27,948 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:54:27,949 >> Model config Qwen2Config {
 "architectures": [
 "Qwen2ForCausalLM"
 ],
@@ -4414,16 +2320,16 @@ Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/e
 "vocab_size": 151936
 }
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:12:33,289 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:12:33,337 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:12:33,338 >> Model config Qwen2Config {
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:54:28,013 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:54:28,179 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:54:28,230 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:54:28,230 >> Model config Qwen2Config {
 "architectures": [
 "Qwen2ForCausalLM"
 ],
@@ -4479,40 +2385,40 @@ Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/e
 "vocab_size": 151936
 }
-[WARNING|logging.py:328] 2025-10-22 16:12:33,338 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|2025-10-22 16:12:33] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
-[WARNING|logging.py:328] 2025-10-22 16:12:33,651 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|modeling_utils.py:1172] 2025-10-22 16:12:33,651 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
-[INFO|modeling_utils.py:2341] 2025-10-22 16:12:33,652 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
-[INFO|configuration_utils.py:986] 2025-10-22 16:12:33,653 >> Generate config GenerationConfig {
+[WARNING|logging.py:328] 2025-10-22 16:54:28,231 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|2025-10-22 16:54:28] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
+[WARNING|logging.py:328] 2025-10-22 16:54:28,587 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 16:54:28,588 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 16:54:28,589 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
+[INFO|configuration_utils.py:986] 2025-10-22 16:54:28,589 >> Generate config GenerationConfig {
 "bos_token_id": 151643,
 "eos_token_id": 151643
 }
-[INFO|configuration_utils.py:941] 2025-10-22 16:12:33,738 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
-[INFO|configuration_utils.py:986] 2025-10-22 16:12:33,739 >> Generate config GenerationConfig {
+[INFO|configuration_utils.py:941] 2025-10-22 16:54:28,673 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 16:54:28,673 >> Generate config GenerationConfig {
 "bos_token_id": 151643,
 "eos_token_id": 151643,
 "max_new_tokens": 2048
 }
-[INFO|dynamic_module_utils.py:423] 2025-10-22 16:12:33,767 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
-[INFO|2025-10-22 16:12:33] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
-[INFO|2025-10-22 16:12:34] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
-[INFO|2025-10-22 16:12:34] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
-[INFO|2025-10-22 16:12:34] llamafactory.model.loader:143 >> all params: 494,032,768
-[INFO|2025-10-22 16:12:34] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
-[INFO|configuration_utils.py:491] 2025-10-22 16:12:34,577 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
-[INFO|configuration_utils.py:757] 2025-10-22 16:12:34,582 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
-[INFO|modeling_utils.py:4181] 2025-10-22 16:12:36,078 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
-[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:36,082 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
-[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:36,087 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:36,092 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
-[INFO|2025-10-22 16:12:36] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
+[INFO|dynamic_module_utils.py:423] 2025-10-22 16:54:28,707 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|2025-10-22 16:54:28] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-22 16:54:29] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
+[INFO|2025-10-22 16:54:29] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+[INFO|2025-10-22 16:54:29] llamafactory.model.loader:143 >> all params: 494,032,768
+[INFO|2025-10-22 16:54:29] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
+[INFO|configuration_utils.py:491] 2025-10-22 16:54:29,596 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
+[INFO|configuration_utils.py:757] 2025-10-22 16:54:29,600 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
+[INFO|modeling_utils.py:4181] 2025-10-22 16:54:31,119 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:54:31,123 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:54:31,128 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:54:31,132 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
+[INFO|2025-10-22 16:54:31] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
 ========================================
 Merge/Export completed successfully
-End Time: Wed Oct 22 04:12:37 PM EDT 2025
+End Time: Wed Oct 22 04:54:31 PM EDT 2025
 ========================================
========================================