diff --git "a/training_artifacts/logs/pipeline_cleaned.txt" "b/training_artifacts/logs/pipeline_cleaned.txt"
--- "a/training_artifacts/logs/pipeline_cleaned.txt"
+++ "b/training_artifacts/logs/pipeline_cleaned.txt"
@@ -2474,6 +2474,10 @@ LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
Starting distributed training with torch.distributed.run...
+
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
warnings.warn(
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
@@ -2482,19 +2486,19 @@ Starting distributed training with torch.distributed.run...
import pkg_resources
/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
-[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
-[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,323 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,495 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[INFO|configuration_utils.py:765] 2025-10-22 16:08:34,697 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:08:34,698 >> Model config Qwen2Config {
+[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 2, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
+[INFO|2025-10-22 16:08:34] llamafactory.hparams.parser:423 >> Process rank: 3, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,348 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,527 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:08:34,718 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:08:34,719 >> Model config Qwen2Config {
"architectures": [
"Qwen2ForCausalLM"
],
@@ -2550,88 +2554,82 @@ Starting distributed training with torch.distributed.run...
"vocab_size": 151936
}
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file added_tokens.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file special_tokens_map.json from cache at None
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,765 >> loading file chat_template.jinja from cache at None
-[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,936 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:08:34,783 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:08:34,958 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-10-22 16:08:34] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
-/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4876: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
- warnings.warn( # warn only once
-[rank0]:[W1022 16:08:35.101503344 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
-gl064:2371875:2371875 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
-gl064:2371875:2371875 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
-gl064:2371875:2371875 [0] NCCL INFO cudaDriverVersion 13000
-gl064:2371875:2371875 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
-gl064:2371875:2371875 [0] NCCL INFO Comm config Blocking set to 1
-gl064:2371876:2371876 [1] NCCL INFO cudaDriverVersion 13000
-gl064:2371876:2371876 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
-gl064:2371876:2371876 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
-gl064:2371876:2371876 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
-gl064:2371876:2371876 [1] NCCL INFO Comm config Blocking set to 1
-gl064:2371875:2371942 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
-gl064:2371875:2371942 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
-gl064:2371875:2371942 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
-gl064:2371875:2371942 [0] NCCL INFO NCCL_IB_HCA set to mlx5
-gl064:2371876:2371943 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
-gl064:2371876:2371943 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
-gl064:2371876:2371943 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
-gl064:2371876:2371943 [1] NCCL INFO NCCL_IB_HCA set to mlx5
-gl064:2371875:2371942 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
-gl064:2371875:2371942 [0] NCCL INFO Initialized NET plugin IB
-gl064:2371875:2371942 [0] NCCL INFO Assigned NET plugin IB to comm
-gl064:2371875:2371942 [0] NCCL INFO Using network IB
-gl064:2371875:2371942 [0] NCCL INFO ncclCommInitRankConfig comm 0x132ed7c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init START
-gl064:2371876:2371943 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
-gl064:2371876:2371943 [1] NCCL INFO Initialized NET plugin IB
-gl064:2371876:2371943 [1] NCCL INFO Assigned NET plugin IB to comm
-gl064:2371876:2371943 [1] NCCL INFO Using network IB
-gl064:2371876:2371943 [1] NCCL INFO ncclCommInitRankConfig comm 0x14ae6de0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init START
-gl064:2371875:2371942 [0] NCCL INFO RAS client listening socket at ::1<28028>
-gl064:2371876:2371943 [1] NCCL INFO RAS client listening socket at ::1<28028>
-gl064:2371875:2371942 [0] NCCL INFO Bootstrap timings total 0.018468 (create 0.000023, send 0.000185, recv 0.003363, ring 0.000496, delay 0.000000)
-gl064:2371876:2371943 [1] NCCL INFO Bootstrap timings total 0.015250 (create 0.000022, send 0.000071, recv 0.008653, ring 0.005848, delay 0.000000)
-gl064:2371876:2371943 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
-gl064:2371875:2371942 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
-gl064:2371876:2371943 [1] NCCL INFO comm 0x14ae6de0 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
-gl064:2371875:2371942 [0] NCCL INFO comm 0x132ed7c0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
-gl064:2371876:2371943 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
-gl064:2371876:2371943 [1] NCCL INFO P2P Chunksize set to 131072
-gl064:2371875:2371942 [0] NCCL INFO Channel 00/02 : 0 1 2 3
-gl064:2371875:2371942 [0] NCCL INFO Channel 01/02 : 0 1 2 3
-gl064:2371875:2371942 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
-gl064:2371875:2371942 [0] NCCL INFO P2P Chunksize set to 131072
-gl064:2371876:2371943 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
-gl064:2371875:2371942 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
-gl064:2371875:2371942 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
-gl064:2371876:2371948 [1] NCCL INFO [Proxy Service] Device 1 CPU core 13
-gl064:2371876:2371950 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 4
-gl064:2371875:2371949 [0] NCCL INFO [Proxy Service] Device 0 CPU core 14
-gl064:2371875:2371951 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 5
-gl064:2371876:2371943 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-gl064:2371876:2371943 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
-gl064:2371875:2371942 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-gl064:2371875:2371942 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
-gl064:2371875:2371942 [0] NCCL INFO CC Off, workFifoBytes 1048576
-gl064:2371876:2371943 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
-gl064:2371876:2371943 [1] NCCL INFO ncclCommInitRankConfig comm 0x14ae6de0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
-gl064:2371875:2371942 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
-gl064:2371876:2371943 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.15 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
-gl064:2371875:2371942 [0] NCCL INFO ncclCommInitRankConfig comm 0x132ed7c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
-gl064:2371875:2371942 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.15 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
-gl064:2371875:2371952 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0
-gl064:2371875:2371954 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 7
-gl064:2371875:2371952 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0
-gl064:2371875:2371952 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
-gl064:2371875:2371952 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
-gl064:2371876:2371953 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0
-gl064:2371876:2371953 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0
-gl064:2371876:2371955 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 11
-gl064:2371875:2371952 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
-gl064:2371876:2371953 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+gl065:3757610:3757610 [1] NCCL INFO cudaDriverVersion 13000
+[rank2]:[W1022 16:08:35.111666688 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
+gl065:3757610:3757610 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3757610:3757610 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
+gl065:3757610:3757610 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl065:3757610:3757610 [1] NCCL INFO Comm config Blocking set to 1
+gl065:3757609:3757609 [0] NCCL INFO cudaDriverVersion 13000
+gl065:3757609:3757609 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3757609:3757609 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.1<0>
+gl065:3757609:3757609 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl065:3757609:3757609 [0] NCCL INFO Comm config Blocking set to 1
+gl065:3757610:3757745 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl065:3757610:3757745 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl065:3757610:3757745 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3757610:3757745 [1] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3757609:3757746 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl065:3757609:3757746 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl065:3757610:3757745 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3757610:3757745 [1] NCCL INFO Initialized NET plugin IB
+gl065:3757609:3757746 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl065:3757609:3757746 [0] NCCL INFO NCCL_IB_HCA set to mlx5
+gl065:3757610:3757745 [1] NCCL INFO Assigned NET plugin IB to comm
+gl065:3757610:3757745 [1] NCCL INFO Using network IB
+gl065:3757610:3757745 [1] NCCL INFO ncclCommInitRankConfig comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init START
+gl065:3757609:3757746 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.1<0>
+gl065:3757609:3757746 [0] NCCL INFO Initialized NET plugin IB
+gl065:3757609:3757746 [0] NCCL INFO Assigned NET plugin IB to comm
+gl065:3757609:3757746 [0] NCCL INFO Using network IB
+gl065:3757609:3757746 [0] NCCL INFO ncclCommInitRankConfig comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init START
+gl065:3757609:3757746 [0] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3757610:3757745 [1] NCCL INFO RAS client listening socket at ::1<28028>
+gl065:3757610:3757745 [1] NCCL INFO Bootstrap timings total 0.014799 (create 0.000048, send 0.000601, recv 0.001295, ring 0.005397, delay 0.000000)
+gl065:3757609:3757746 [0] NCCL INFO Bootstrap timings total 0.018568 (create 0.000023, send 0.000388, recv 0.000964, ring 0.005735, delay 0.000000)
+gl065:3757609:3757746 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
+gl065:3757610:3757745 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
+gl065:3757610:3757745 [1] NCCL INFO comm 0x15ee2290 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
+gl065:3757609:3757746 [0] NCCL INFO comm 0x13a861a0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
+gl065:3757610:3757745 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
+gl065:3757610:3757745 [1] NCCL INFO P2P Chunksize set to 131072
+gl065:3757609:3757746 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
+gl065:3757609:3757746 [0] NCCL INFO P2P Chunksize set to 131072
+gl065:3757610:3757745 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3757609:3757746 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl065:3757610:3757751 [1] NCCL INFO [Proxy Service] Device 1 CPU core 11
+gl065:3757610:3757753 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 13
+gl065:3757609:3757754 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 14
+gl065:3757609:3757752 [0] NCCL INFO [Proxy Service] Device 0 CPU core 12
+gl065:3757609:3757746 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3757609:3757746 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3757610:3757745 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl065:3757610:3757745 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl065:3757609:3757746 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3757609:3757746 [0] NCCL INFO ncclCommInitRankConfig comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
+gl065:3757610:3757745 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl065:3757609:3757746 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.14 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3757610:3757745 [1] NCCL INFO ncclCommInitRankConfig comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0xf1d8741eaa5d2206 - Init COMPLETE
+gl065:3757610:3757745 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.15 (kernels 0.09, alloc 0.01, bootstrap 0.01, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
+gl065:3757609:3757756 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3757609:3757756 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/IB/0
+gl065:3757609:3757757 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 9
+gl065:3757609:3757756 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3757609:3757756 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
+gl065:3757610:3757755 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3757610:3757755 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/IB/0
+gl065:3757610:3757758 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 2
+gl065:3757610:3757755 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+gl065:3757609:3757756 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
training example:
input_ids:
[33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 
582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 
18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 
9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
@@ -2883,8 +2881,8 @@ Hence, the correct answer is:
(67 + 31) + 71
<|endoftext|>
-[INFO|configuration_utils.py:765] 2025-10-22 16:08:36,199 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
-[INFO|configuration_utils.py:839] 2025-10-22 16:08:36,200 >> Model config Qwen2Config {
+[INFO|configuration_utils.py:765] 2025-10-22 16:08:36,172 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:08:36,173 >> Model config Qwen2Config {
"architectures": [
"Qwen2ForCausalLM"
],
@@ -2941,41 +2939,45 @@ Hence, the correct answer is:
}
[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
-[WARNING|logging.py:328] 2025-10-22 16:08:36,524 >> `torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|modeling_utils.py:1172] 2025-10-22 16:08:36,525 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
-[INFO|modeling_utils.py:2341] 2025-10-22 16:08:36,526 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
-[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,527 >> Generate config GenerationConfig {
+[WARNING|logging.py:328] 2025-10-22 16:08:36,500 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 16:08:36,502 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 16:08:36,502 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
+[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,503 >> Generate config GenerationConfig {
"bos_token_id": 151643,
"eos_token_id": 151643,
"use_cache": false
}
`torch_dtype` is deprecated! Use `dtype` instead!
-[INFO|configuration_utils.py:941] 2025-10-22 16:08:36,796 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
-[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,797 >> Generate config GenerationConfig {
+[INFO|configuration_utils.py:941] 2025-10-22 16:08:36,753 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 16:08:36,754 >> Generate config GenerationConfig {
"bos_token_id": 151643,
"eos_token_id": 151643,
"max_new_tokens": 2048
}
-[INFO|dynamic_module_utils.py:423] 2025-10-22 16:08:36,825 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|dynamic_module_utils.py:423] 2025-10-22 16:08:36,785 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
[INFO|2025-10-22 16:08:36] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
[INFO|2025-10-22 16:08:36] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
-[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.misc:143 >> Found linear modules: gate_proj,k_proj,q_proj,o_proj,v_proj,up_proj,down_proj
+[INFO|2025-10-22 16:08:36] llamafactory.model.model_utils.misc:143 >> Found linear modules: o_proj,down_proj,v_proj,up_proj,gate_proj,q_proj,k_proj
[INFO|2025-10-22 16:08:37] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
-[WARNING|trainer.py:906] 2025-10-22 16:08:37,075 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
-[INFO|trainer.py:699] 2025-10-22 16:08:37,077 >> max_steps is given, it will override any value given in num_train_epochs
-[INFO|trainer.py:749] 2025-10-22 16:08:37,077 >> Using auto half precision backend
-[WARNING|2025-10-22 16:08:37] llamafactory.train.callbacks:154 >> Previous trainer log in this folder will be deleted.
-[WARNING|trainer.py:982] 2025-10-22 16:08:37,081 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+[WARNING|trainer.py:906] 2025-10-22 16:08:37,029 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
+[INFO|trainer.py:699] 2025-10-22 16:08:37,031 >> max_steps is given, it will override any value given in num_train_epochs
+[INFO|trainer.py:749] 2025-10-22 16:08:37,031 >> Using auto half precision backend
+[WARNING|trainer.py:982] 2025-10-22 16:08:37,032 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
-[INFO|trainer.py:2519] 2025-10-22 16:08:37,579 >> ***** Running training *****
-[INFO|trainer.py:2520] 2025-10-22 16:08:37,579 >> Num examples = 48,600
-[INFO|trainer.py:2521] 2025-10-22 16:08:37,579 >> Num Epochs = 1
-[INFO|trainer.py:2522] 2025-10-22 16:08:37,579 >> Instantaneous batch size per device = 1
+[INFO|trainer.py:2519] 2025-10-22 16:08:37,580 >> ***** Running training *****
+[INFO|trainer.py:2520] 2025-10-22 16:08:37,580 >> Num examples = 48,600
+[INFO|trainer.py:2521] 2025-10-22 16:08:37,580 >> Num Epochs = 1
+[INFO|trainer.py:2522] 2025-10-22 16:08:37,581 >> Instantaneous batch size per device = 1
+[INFO|trainer.py:2525] 2025-10-22 16:08:37,581 >> Total train batch size (w. parallel, distributed & accumulation) = 4
+[INFO|trainer.py:2526] 2025-10-22 16:08:37,581 >> Gradient Accumulation steps = 1
+[INFO|trainer.py:2527] 2025-10-22 16:08:37,581 >> Total optimization steps = 100
+[INFO|trainer.py:2528] 2025-10-22 16:08:37,582 >> Number of trainable parameters = 4,399,104
+vice = 1
[INFO|trainer.py:2525] 2025-10-22 16:08:37,579 >> Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2526] 2025-10-22 16:08:37,579 >> Gradient Accumulation steps = 1
[INFO|trainer.py:2527] 2025-10-22 16:08:37,579 >> Total optimization steps = 100
@@ -2992,8 +2994,14 @@ wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/4de4rspj
10%| | 10/100 [00:02<00:20, 4.47it/s] 11%| | 11/100 [00:02<00:19, 4.50it/s] 12%| | 12/100 [00:03<00:26, 3.33it/s] 13%| | 13/100 [00:03<00:22, 3.92it/s] 14%| | 14/100 [00:03<00:19, 4.46it/s] 15%| | 15/100 [00:03<00:17, 4.80it/s] 16%| | 16/100 [00:04<00:23, 3.60it/s] 17%| | 17/100 [00:04<00:22, 3.63it/s] 18%| | 18/100 [00:04<00:22, 3.69it/s] 19%| | 19/100 [00:05<00:22, 3.68it/s] 20%| | 20/100 [00:05<00:20, 3.97it/s] {'loss': 0.7526, 'grad_norm': 0.3976583480834961, 'learning_rate': 4.05e-05, 'epoch': 0.0}
20%| | 20/100 [00:05<00:20, 3.97it/s] 21%| | 21/100 [00:05<00:20, 3.93it/s] 22%| | 22/100 [00:05<00:18, 4.17it/s] 23%| | 23/100 [00:05<00:18, 4.05it/s] 24%| | 24/100 [00:06<00:17, 4.38it/s] 25%| | 25/100 [00:06<00:18, 4.01it/s] 26%| | 26/100 [00:06<00:17, 4.16it/s] 27%| | 27/100 [00:06<00:17, 4.08it/s] 28%| | 28/100 [00:07<00:19, 3.65it/s] 29%| | 29/100 [00:07<00:18, 3.75it/s] 30%| | 30/100 [00:07<00:18, 3.85it/s] {'loss': 0.7383, 'grad_norm': 0.465567946434021, 'learning_rate': 3.55e-05, 'epoch': 0.0}
30%| | 30/100 [00:07<00:18, 3.85it/s] 31%| | 31/100 [00:08<00:18, 3.83it/s] 32%| | 32/100 [00:08<00:16, 4.05it/s] 33%| | 33/100 [00:08<00:15, 4.44it/s] 34%| | 34/100 [00:08<00:13, 4.87it/s] 35%| | 35/100 [00:08<00:14, 4.57it/s] 36%| | 36/100 [00:09<00:12, 5.02it/s] 37%| | 37/100 [00:09<00:13, 4.69it/s] 38%| | 38/100 [00:09<00:13, 4.65it/s] 39%| | 39/100 [00:09<00:12, 4.97it/s] 40%| | 40/100 [00:09<00:13, 4.57it/s] {'loss': 0.7139, 'grad_norm': 0.3747170865535736, 'learning_rate': 3.05e-05, 'epoch': 0.0}
- 40%| | 40/100 [00:09<00:13, 4.57it/s] 41%| | 41/100 [00:10<00:14, 4.09it/s] 42%| | 42/100 [00:10<00:14, 4.07it/s] 43%| | 43/100 [00:10<00:12, 4.57it/s] 44%| | 44/100 [00:10<00:11, 4.89it/s] 45%| | 45/100 [00:10<00:10, 5.31it/s] 46%| | 46/100 [00:11<00:10, 4.97it/s] 47%| | 47/100 [00:11<00:10, 4.87it/s] 48%| | 48/100 [00:11<00:10, 5.13it/s] 49%| | 49/100 [00:11<00:11, 4.49it/s] 50%| | 50/100 [00:12<00:11, 4.48it/s] {'loss': 0.6497, 'grad_norm': 0.5903568267822266, 'learning_rate': 2.5500000000000003e-05, 'epoch': 0.0}
- 50%| | 50/100 [00:12<00:11, 4.48it/s][INFO|trainer.py:4309] 2025-10-22 16:08:50,960 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
+ 40%| | 40/100 [00:09<00:13, 4.57it/s] 41%| | 41/100 [00:10<00:14, 4.09it/s] 42%| | 42/100 [00:10<00:14, 4.07it/s] 43%| | 43/100 [00:10<00:12, 4.57it/s] 44%| | 44/100 [00:10<00:11, 4.89it/s] 45%| | 45/100 [00:10<00:10, 5.31it/s] 46%| | 46/100 [00:11<00:10, 4.97it/s] 47%| | 47/100 [00:11<00:10, 4.87it/s] 48%| | 48/100 [00:11<00:10, 5.13it/s][INFO|trainer.py:2810] 2025-10-22 16:09:02,748 >>
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+gl065:3757610:3757610 [1] NCCL INFO comm 0x15ee2290 rank 3 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
+gl065:3757609:3757609 [0] NCCL INFO comm 0x13a861a0 rank 2 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
+ 2025-10-22 16:08:50,960 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
[INFO|configuration_utils.py:765] 2025-10-22 16:08:51,135 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
[INFO|configuration_utils.py:839] 2025-10-22 16:08:51,137 >> Model config Qwen2Config {
"architectures": [
@@ -3409,3 +3417,1106 @@ Preparing Training Artifacts
========================================
Copying configuration files...
Copying and cleaning training logs...
+Training artifacts prepared in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/training_artifacts
+Contents:
+Log files:
+
+========================================
+STAGE 3: Uploading to HuggingFace Hub
+Repository: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+Start Time: Wed Oct 22 04:09:18 PM EDT 2025
+========================================
+Uploading contents of: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+Directory structure:
+
+Executing: huggingface-cli upload TAUR-dev/testing_llamafactory_helper_quick_test__interactive /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged .
+Start hashing 17 files.
+Finished hashing 17 files.
+[33m Warning: 'huggingface-cli upload' is deprecated. Use 'hf upload' instead.[0m
+Processing Files (0 / 0) : | | 0.00B / 0.00B
+New Data Upload : | | 0.00B / 0.00B [A
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 11%| | 109MB / 988MB [A[A[A
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 11%| | 109MB / 988MB [A[A[AProcessing Files (1 / 2) : 12%| | 120MB / 1.00GB, ???B/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 24%| | 235MB / 988MB [A[A[AProcessing Files (1 / 2) : 25%| | 246MB / 1.00GB, 628MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 36%| | 352MB / 988MB [A[A[AProcessing Files (1 / 2) : 36%| | 364MB / 1.00GB, 609MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 48%| | 478MB / 988MB [A[A[AProcessing Files (1 / 2) : 49%| | 490MB / 1.00GB, 615MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 60%| | 596MB / 988MB [A[A[AProcessing Files (1 / 2) : 61%| | 607MB / 1.00GB, 608MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 73%| | 721MB / 988MB [A[A[AProcessing Files (1 / 2) : 73%| | 733MB / 1.00GB, 612MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 86%| | 847MB / 988MB [A[A[AProcessing Files (1 / 2) : 86%| | 859MB / 1.00GB, 615MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 98%|| 965MB / 988MB [A[A[AProcessing Files (1 / 2) : 98%|| 976MB / 1.00GB, 611MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 100%|| 988MB / 988MB [A[A[AProcessing Files (2 / 2) : 100%|| 1.00GB / 1.00GB, 549MB/s
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 100%|| 988MB / 988MB [A[A[A
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 100%|| 988MB / 988MB [A[A[A
+
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB [A[A
+
+
+ .../merged/model.safetensors: 100%|| 988MB / 988MB [A[A[AProcessing Files (2 / 2) : 100%|| 1.00GB / 1.00GB, 440MB/s
+New Data Upload : | | 0.00B / 0.00B, 0.00B/s
+ ...ive/merged/tokenizer.json: 100%|| 11.4MB / 11.4MB
+ .../merged/model.safetensors: 100%|| 988MB / 988MB
+Removing 14 file(s) from commit that have not changed.
+https://huggingface.co/TAUR-dev/testing_llamafactory_helper_quick_test__interactive/tree/main/.
+
+========================================
+Upload completed successfully
+Model and training artifacts uploaded to: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+End Time: Wed Oct 22 04:09:24 PM EDT 2025
+========================================
+
+========================================
+STAGE 4: Cleanup
+========================================
+Keeping checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Keeping merged model in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+
+========================================
+PIPELINE COMPLETED SUCCESSFULLY
+End Time: Wed Oct 22 04:09:24 PM EDT 2025
+========================================
+
+========================================
+Cleaning up LlamaFactory processes
+========================================
+Cleaned up processes on gl064.hpc.nyu.edu
+Cleaning up processes on worker node: gl065
+Process cleanup complete
+========================================
+Job Name: lf_torch_test__interactive
+Hostname: gl064.hpc.nyu.edu
+Number of nodes: 2
+GPUs per node: 2
+Start Time: Wed Oct 22 04:11:30 PM EDT 2025
+Log file: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/logs/pipeline.log
+========================================
+Sourcing secrets from: /scratch/zrs2020/LlamaFactoryHelper/secrets.env
+
+========================================
+Configuration Paths
+========================================
+Train Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+Merge Config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+Dataset Info:
+Output Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Export Dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+HF Repo ID: TAUR-dev/testing_llamafactory_helper_quick_test__interactive
+
+
+========================================
+Multi-Node Coordination
+========================================
+This is the master node - coordinating worker nodes...
+Master node: gl064
+Master port: 29500
+World size: 2
+
+Launching on worker node 1: gl065
+All worker nodes launched successfully
+Master node (this node) will now join training as rank 0
+
+
+========================================
+STAGE 1: Training Model
+Start Time: Wed Oct 22 04:11:33 PM EDT 2025
+========================================
+Multi-node training detected
+Nodes: 2, GPUs per node: 2
+Master address: gl064
+Master port: 29500
+Node rank: 0
+World size: 2
+CUDA_VISIBLE_DEVICES: 0,1
+LLaMA-Factory path: /scratch/zrs2020/LlamaFactoryHelper/LLaMA-Factory
+Training config: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/train_config.yaml
+
+Starting distributed training with torch.distributed.run...
+
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+ warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+ warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+ import pkg_resources
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+ import pkg_resources
+[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
+[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:423 >> Process rank: 0, world size: 4, device: cuda:0, distributed training: True, compute dtype: torch.float16
+[INFO|2025-10-22 16:11:51] llamafactory.hparams.parser:423 >> Process rank: 1, world size: 4, device: cuda:1, distributed training: True, compute dtype: torch.float16
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:51,908 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:11:52,077 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:11:52,302 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:11:52,304 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,369 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:11:52,370 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:11:52,534 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|2025-10-22 16:11:52] llamafactory.data.loader:143 >> Loading dataset TAUR-dev/D-SFT_C-sft_exp_AT_pvv2__fixed-sft-data...
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/torch/distributed/distributed_c10d.py:4876: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
+ warnings.warn( # warn only once
+[rank0]:[W1022 16:11:53.769183387 ProcessGroupNCCL.cpp:5068] Guessing device ID based on global rank. This can cause a hang if rank to GPU mapping is heterogeneous. You can specify device_id in init_process_group()
+gl064:2373549:2373549 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl064:2373549:2373549 [0] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
+gl064:2373549:2373549 [0] NCCL INFO cudaDriverVersion 13000
+gl064:2373549:2373549 [0] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl064:2373549:2373549 [0] NCCL INFO Comm config Blocking set to 1
+gl064:2373550:2373550 [1] NCCL INFO cudaDriverVersion 13000
+gl064:2373550:2373550 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl064:2373550:2373550 [1] NCCL INFO Bootstrap: Using ibs3:10.0.5.0<0>
+gl064:2373550:2373550 [1] NCCL INFO NCCL version 2.27.5+cuda12.9
+gl064:2373550:2373550 [1] NCCL INFO Comm config Blocking set to 1
+gl064:2373549:2373629 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl064:2373549:2373629 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl064:2373549:2373629 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl064:2373549:2373629 [0] NCCL INFO NCCL_IB_HCA set to mlx5
+gl064:2373550:2373630 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
+gl064:2373550:2373630 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
+gl064:2373550:2373630 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ibs
+gl064:2373550:2373630 [1] NCCL INFO NCCL_IB_HCA set to mlx5
+gl064:2373549:2373629 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
+gl064:2373549:2373629 [0] NCCL INFO Initialized NET plugin IB
+gl064:2373549:2373629 [0] NCCL INFO Assigned NET plugin IB to comm
+gl064:2373549:2373629 [0] NCCL INFO Using network IB
+gl064:2373549:2373629 [0] NCCL INFO ncclCommInitRankConfig comm 0x156177e0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x94e527c6b9214f3c - Init START
+gl064:2373550:2373630 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [RO]; OOB ibs3:10.0.5.0<0>
+gl064:2373550:2373630 [1] NCCL INFO Initialized NET plugin IB
+gl064:2373550:2373630 [1] NCCL INFO Assigned NET plugin IB to comm
+gl064:2373550:2373630 [1] NCCL INFO Using network IB
+gl064:2373550:2373630 [1] NCCL INFO ncclCommInitRankConfig comm 0x123b0010 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x94e527c6b9214f3c - Init START
+gl064:2373550:2373630 [1] NCCL INFO RAS client listening socket at ::1<28028>
+gl064:2373549:2373629 [0] NCCL INFO RAS client listening socket at ::1<28028>
+gl064:2373550:2373630 [1] NCCL INFO Bootstrap timings total 0.008614 (create 0.000021, send 0.000072, recv 0.006619, ring 0.001056, delay 0.000000)
+gl064:2373549:2373629 [0] NCCL INFO Bootstrap timings total 0.019798 (create 0.000024, send 0.000210, recv 0.001659, ring 0.000908, delay 0.000000)
+gl064:2373550:2373630 [1] NCCL INFO Setting affinity for GPU 1 to 0-15
+gl064:2373549:2373629 [0] NCCL INFO Setting affinity for GPU 0 to 0-15
+gl064:2373549:2373629 [0] NCCL INFO comm 0x156177e0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
+gl064:2373550:2373630 [1] NCCL INFO comm 0x123b0010 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
+gl064:2373549:2373629 [0] NCCL INFO Channel 00/02 : 0 1 2 3
+gl064:2373549:2373629 [0] NCCL INFO Channel 01/02 : 0 1 2 3
+gl064:2373550:2373630 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
+gl064:2373549:2373629 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
+gl064:2373549:2373629 [0] NCCL INFO P2P Chunksize set to 131072
+gl064:2373550:2373630 [1] NCCL INFO P2P Chunksize set to 131072
+gl064:2373550:2373630 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl064:2373549:2373629 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
+gl064:2373549:2373629 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
+gl064:2373550:2373635 [1] NCCL INFO [Proxy Service] Device 1 CPU core 10
+gl064:2373549:2373636 [0] NCCL INFO [Proxy Service] Device 0 CPU core 11
+gl064:2373550:2373637 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 12
+gl064:2373549:2373638 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 13
+gl064:2373550:2373630 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl064:2373550:2373630 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl064:2373549:2373629 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
+gl064:2373549:2373629 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
+gl064:2373549:2373629 [0] NCCL INFO CC Off, workFifoBytes 1048576
+gl064:2373550:2373630 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl064:2373550:2373630 [1] NCCL INFO ncclCommInitRankConfig comm 0x123b0010 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 59000 commId 0x94e527c6b9214f3c - Init COMPLETE
+gl064:2373550:2373630 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.13 (kernels 0.09, alloc 0.01, bootstrap 0.01, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
+gl064:2373549:2373629 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
+gl064:2373549:2373629 [0] NCCL INFO ncclCommInitRankConfig comm 0x156177e0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 47000 commId 0x94e527c6b9214f3c - Init COMPLETE
+gl064:2373549:2373629 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.14 (kernels 0.09, alloc 0.01, bootstrap 0.02, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
+gl064:2373549:2373640 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/IB/0
+gl064:2373549:2373641 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 0
+gl064:2373549:2373640 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/IB/0
+gl064:2373549:2373640 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
+gl064:2373549:2373640 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
+gl064:2373550:2373639 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/IB/0
+gl064:2373550:2373639 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/IB/0
+gl064:2373550:2373642 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 7
+gl064:2373549:2373640 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+gl064:2373550:2373639 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
+training example:
+input_ids:
+[33975, 25, 21806, 279, 2701, 3491, 13, 81917, 697, 32711, 3019, 553, 3019, 13, 3197, 498, 525, 8060, 11, 2968, 697, 4226, 304, 419, 3561, 25, 366, 9217, 2235, 21732, 4226, 12533, 9217, 94367, 2, 22079, 198, 16429, 279, 5109, 304, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 1125, 1855, 458, 23606, 429, 16819, 220, 16, 21, 24, 13, 1446, 646, 990, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 8, 323, 1817, 1372, 646, 1172, 387, 1483, 3055, 13, 4615, 6291, 1265, 2924, 264, 4013, 315, 7354, 330, 8304, 1599, 2974, 1380, 1817, 3019, 374, 264, 35972, 5666, 323, 279, 1590, 3019, 13653, 11508, 311, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 382, 35127, 697, 4226, 304, 279, 2701, 3561, 510, 27, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 1339, 9064, 11993, 21732, 4226, 9940, 374, 279, 1140, 315, 7354, 311, 5545, 279, 2169, 1372, 476, 432, 1265, 387, 264, 3175, 23606, 429, 3059, 304, 279, 2169, 13, 4710, 2461, 3110, 510, 2679, 279, 1140, 315, 5109, 572, 508, 16, 11, 220, 17, 11, 220, 18, 60, 323, 279, 2169, 572, 220, 16, 11, 498, 1410, 3270, 510, 27, 9217, 397, 8304, 220, 16, 25, 220, 16, 488, 220, 17, 284, 220, 18, 198, 8304, 220, 17, 25, 220, 18, 608, 220, 18, 284, 220, 16, 198, 522, 9217, 1339, 269, 4710, 27, 9217, 397, 7, 16, 488, 220, 17, 8, 608, 220, 18, 198, 522, 9217, 1339, 10061, 594, 1744, 3019, 553, 3019, 13, 151643, 198, 71703, 25, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 
582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 
18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 
9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
+inputs:
+Human: Answer the following problem. Explain your reasoning step by step. When you are finished, give your answer in this format: (your answer).
+
+# Problem
+Using the numbers in the list [67, 71, 31], create an equation that equals 169. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Your solution should include a series of steps "Step X:" where each step is a mathematical operation and the final step ultimately leads to the target number or it should be a single equation that results in the target.
+
+Give your answer in the following format:
+
+(your answer)
+
+
+Where "(your answer)" is the list of steps to reach the target number or it should be a single equation that results in the target.
+
+For example:
+If the list of numbers was [1, 2, 3] and the target was 1, you could write:
+
+Step 1: 1 + 2 = 3
+Step 2: 3 / 3 = 1
+
+
+or
+
+
+(1 + 2) / 3
+
+
+Let's think step by step.<|endoftext|>
+Assistant:
+
+First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
+Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
+Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
+Step 3: Try different combinations of these operations with the given numbers.
+Step 4: Test various equations until we find one that equals 169.
+Step 5: Once we have found the correct equation, present it in the specified format.
+After going through all these steps, here's the final equation that equals 169:
+
+
+(67 * 2) + 71 - 31
+
+
+
+Let us verify this answer:
+
+First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
+
+
+Incorrect
+
+
+Let me verify my answer.
+
+Rewritten Question:
+
+Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
+
+Step 1: Multiply 67 and 31.
+Step 2: Add 71 to the result from Step 1.
+Step 3: The final result will be 169.
+
+
+(67 * 31) + 71 = 169
+
+
+
+Let us verify this answer:
+
+First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
+
+
+Incorrect
+
+
+Time to review my response one more time.
+
+First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
+The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
+I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
+Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
+After finding the correct combination, I will present my solution in the "
+(your answer)
+" format.
+Here's my detailed plan:
+
+1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
+2. Evaluate each combination to see if it equals 169.
+3. If no combination equals 169, repeat the process with new combinations until I find one that works.
+4. Once I have found the correct combination, express it as a single equation in the "
+(your answer)
+" format.
+
+Now let me begin searching for the correct combination!
+
+
+Let us verify this answer:
+
+1. Step 1: 67 * 31 = 2077
+2. Step 2: 2077 - 71 = 2006
+
+Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
+
+
+Incorrect
+
+
+But wait, let me think about it again.
+
+Rewritten Question:
+
+What is the result when you add two numbers from the list [67, 71, 31] together?
+
+Step 1: Add 67 and 31.
+Step 2: Take the sum from Step 1 and add 71.
+
+Final Equation: (67 + 31) + 71
+
+
+(67 + 31) + 71
+
+
+
+Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
+
+
+Correct
+
+
+
+
+
+Hence, the correct answer is:
+
+
+(67 + 31) + 71
+<|endoftext|>
+
+label_ids:
+[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 13708, 766, 397, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 8304, 220, 16, 25, 64547, 279, 5795, 481, 1205, 1184, 311, 1855, 458, 23606, 1667, 279, 5109, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 17, 25, 21144, 3204, 7525, 
481, 78646, 11, 75240, 11, 46444, 11, 323, 12804, 624, 8304, 220, 18, 25, 9735, 2155, 27459, 315, 1493, 7525, 448, 279, 2661, 5109, 624, 8304, 220, 19, 25, 3393, 5257, 37906, 3080, 582, 1477, 825, 429, 16819, 220, 16, 21, 24, 624, 8304, 220, 20, 25, 9646, 582, 614, 1730, 279, 4396, 23606, 11, 3042, 432, 304, 279, 5189, 3561, 624, 6025, 2087, 1526, 678, 1493, 7354, 11, 1588, 594, 279, 1590, 23606, 429, 16819, 220, 16, 21, 24, 1447, 27, 9217, 397, 7, 21, 22, 353, 220, 17, 8, 488, 220, 22, 16, 481, 220, 18, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 488, 220, 22, 16, 284, 220, 16, 18, 23, 11, 1221, 220, 16, 18, 23, 481, 220, 18, 16, 284, 220, 16, 15, 22, 13, 1096, 1558, 537, 6144, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 10061, 752, 10146, 847, 4226, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 10253, 279, 5109, 504, 279, 2661, 738, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 311, 1352, 458, 23606, 6144, 287, 220, 16, 21, 24, 1667, 6770, 34784, 7525, 17973, 11, 85922, 11777, 608, 3593, 8304, 220, 16, 25, 71059, 220, 21, 22, 323, 220, 18, 16, 624, 8304, 220, 17, 25, 2691, 220, 22, 16, 311, 279, 1102, 504, 14822, 220, 16, 624, 8304, 220, 18, 25, 576, 1590, 1102, 686, 387, 220, 16, 21, 24, 382, 27, 9217, 397, 7, 21, 22, 353, 220, 18, 16, 8, 488, 220, 22, 16, 284, 220, 16, 21, 24, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 5338, 11, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 11, 15, 22, 22, 11, 1221, 220, 17, 11, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 11, 15, 15, 21, 13, 1096, 374, 537, 6144, 311, 220, 16, 21, 24, 11, 773, 279, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 1462, 311, 3395, 847, 2033, 825, 803, 882, 624, 27, 13611, 397, 5338, 11, 358, 3278, 1430, 311, 3535, 279, 3491, 2664, 553, 4378, 700, 
264, 3119, 323, 728, 2167, 5538, 1119, 7716, 911, 1246, 358, 1265, 11625, 419, 624, 785, 5795, 374, 311, 1855, 458, 23606, 1667, 279, 5109, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 429, 16819, 220, 16, 21, 24, 624, 40, 686, 1191, 553, 4460, 2155, 27459, 315, 1493, 2326, 5109, 323, 862, 3204, 7525, 17973, 11, 85922, 11777, 608, 568, 715, 12209, 358, 686, 15442, 1817, 10601, 323, 1779, 421, 807, 6144, 220, 16, 21, 24, 13, 1416, 537, 11, 358, 686, 3271, 389, 311, 279, 1790, 10601, 3080, 358, 1477, 825, 429, 4278, 624, 6025, 9271, 279, 4396, 10601, 11, 358, 686, 3042, 847, 6291, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 624, 8420, 594, 847, 11682, 3119, 1447, 16, 13, 4230, 678, 3204, 27459, 315, 220, 21, 22, 11, 220, 22, 16, 11, 323, 220, 18, 16, 1667, 5256, 11, 75240, 11, 46444, 11, 323, 12804, 624, 17, 13, 54115, 1817, 10601, 311, 1490, 421, 432, 16819, 220, 16, 21, 24, 624, 18, 13, 1416, 902, 10601, 16819, 220, 16, 21, 24, 11, 13153, 279, 1882, 448, 501, 27459, 3080, 358, 1477, 825, 429, 4278, 624, 19, 13, 9646, 358, 614, 1730, 279, 4396, 10601, 11, 3158, 432, 438, 264, 3175, 23606, 304, 279, 4055, 9217, 397, 7021, 413, 4226, 340, 522, 9217, 9877, 3561, 382, 7039, 1077, 752, 3161, 15039, 369, 279, 4396, 10601, 4894, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 1447, 16, 13, 14822, 220, 16, 25, 220, 21, 22, 353, 220, 18, 16, 284, 220, 17, 15, 22, 22, 198, 17, 13, 14822, 220, 17, 25, 220, 17, 15, 22, 22, 481, 220, 22, 16, 284, 220, 17, 15, 15, 21, 271, 54815, 11, 279, 1590, 1102, 374, 220, 17, 15, 15, 21, 11, 892, 1558, 537, 6144, 220, 16, 21, 24, 13, 576, 4226, 374, 15114, 382, 27, 423, 8477, 397, 40468, 198, 522, 423, 8477, 397, 522, 34913, 397, 3983, 3783, 11, 1077, 752, 1744, 911, 432, 1549, 624, 27, 13611, 397, 58465, 12153, 15846, 1447, 3838, 374, 279, 1102, 979, 498, 912, 1378, 5109, 504, 279, 1140, 508, 21, 22, 11, 220, 22, 16, 11, 220, 18, 16, 60, 3786, 1939, 8304, 220, 16, 25, 2691, 220, 21, 22, 323, 
220, 18, 16, 624, 8304, 220, 17, 25, 11778, 279, 2629, 504, 14822, 220, 16, 323, 912, 220, 22, 16, 382, 19357, 84670, 25, 320, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 271, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 397, 522, 13611, 397, 27, 34913, 397, 10061, 601, 10146, 419, 4226, 25, 5512, 11, 220, 21, 22, 488, 220, 18, 16, 284, 220, 24, 23, 11, 1221, 220, 24, 23, 488, 220, 22, 16, 284, 220, 16, 21, 24, 13, 576, 4226, 374, 4396, 382, 27, 423, 8477, 397, 33092, 198, 522, 423, 8477, 397, 522, 34913, 397, 522, 26865, 10370, 39, 763, 11, 279, 4396, 4226, 374, 1447, 27, 9217, 397, 7, 21, 22, 488, 220, 18, 16, 8, 488, 220, 22, 16, 198, 522, 9217, 29, 151643, 198]
+labels:
+
+
+First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
+Step 1: Identify the goal - We need to create an equation using the numbers [67, 71, 31] that equals 169.
+Step 2: Consider possible operations - Addition, subtraction, multiplication, and division.
+Step 3: Try different combinations of these operations with the given numbers.
+Step 4: Test various equations until we find one that equals 169.
+Step 5: Once we have found the correct equation, present it in the specified format.
+After going through all these steps, here's the final equation that equals 169:
+
+
+(67 * 2) + 71 - 31
+
+
+
+Let us verify this answer:
+
+First, 67 + 71 = 138, then 138 - 31 = 107. This does not equal 169, so the answer is incorrect.
+
+
+Incorrect
+
+
+Let me verify my answer.
+
+Rewritten Question:
+
+Use the numbers from the given set [67, 71, 31] to form an equation equaling 169 using basic arithmetic operations (+, -, *, /).
+
+Step 1: Multiply 67 and 31.
+Step 2: Add 71 to the result from Step 1.
+Step 3: The final result will be 169.
+
+
+(67 * 31) + 71 = 169
+
+
+
+Let us verify this answer:
+
+First, 67 * 31 = 2,077, then 2,077 - 71 = 2,006. This is not equal to 169, so the answer is incorrect.
+
+
+Incorrect
+
+
+Time to review my response one more time.
+
+First, I'll try to understand the problem better by writing out a plan and go really deep into detail about how I should solve this.
+The goal is to create an equation using the numbers 67, 71, and 31 that equals 169.
+I will start by trying different combinations of these three numbers and their possible operations (+, -, *, /).
+Then I will evaluate each combination and check if they equal 169. If not, I will move on to the next combination until I find one that works.
+After finding the correct combination, I will present my solution in the "
+(your answer)
+" format.
+Here's my detailed plan:
+
+1. Create all possible combinations of 67, 71, and 31 using addition, subtraction, multiplication, and division.
+2. Evaluate each combination to see if it equals 169.
+3. If no combination equals 169, repeat the process with new combinations until I find one that works.
+4. Once I have found the correct combination, express it as a single equation in the "
+(your answer)
+" format.
+
+Now let me begin searching for the correct combination!
+
+
+Let us verify this answer:
+
+1. Step 1: 67 * 31 = 2077
+2. Step 2: 2077 - 71 = 2006
+
+Therefore, the final result is 2006, which does not equal 169. The answer is incorrect.
+
+
+Incorrect
+
+
+But wait, let me think about it again.
+
+Rewritten Question:
+
+What is the result when you add two numbers from the list [67, 71, 31] together?
+
+Step 1: Add 67 and 31.
+Step 2: Take the sum from Step 1 and add 71.
+
+Final Equation: (67 + 31) + 71
+
+
+(67 + 31) + 71
+
+
+
+Let us verify this answer: First, 67 + 31 = 98, then 98 + 71 = 169. The answer is correct.
+
+
+Correct
+
+
+
+
+
+Hence, the correct answer is:
+
+
+(67 + 31) + 71
+<|endoftext|>
+
+[INFO|configuration_utils.py:765] 2025-10-22 16:11:53,824 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:11:53,825 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|2025-10-22 16:11:53] llamafactory.model.model_utils.kv_cache:143 >> KV cache is disabled during training.
+[WARNING|logging.py:328] 2025-10-22 16:11:54,141 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 16:11:54,143 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 16:11:54,144 >> Instantiating Qwen2ForCausalLM model under default dtype torch.float16.
+[INFO|configuration_utils.py:986] 2025-10-22 16:11:54,145 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": 151643,
+ "use_cache": false
+}
+
+`torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|configuration_utils.py:941] 2025-10-22 16:11:54,648 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 16:11:54,648 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": 151643,
+ "max_new_tokens": 2048
+}
+
+[INFO|dynamic_module_utils.py:423] 2025-10-22 16:11:54,677 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.checkpointing:143 >> Gradient checkpointing enabled.
+[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-22 16:11:54] llamafactory.model.adapter:143 >> Upcasting trainable params to float32.
+[INFO|2025-10-22 16:11:54] llamafactory.model.adapter:143 >> Fine-tuning method: LoRA
+[INFO|2025-10-22 16:11:54] llamafactory.model.model_utils.misc:143 >> Found linear modules: o_proj,gate_proj,k_proj,v_proj,up_proj,down_proj,q_proj
+[INFO|2025-10-22 16:11:54] llamafactory.model.loader:143 >> trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826
+The model is already on multiple devices. Skipping the move to device specified in `args`.
+The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+[WARNING|trainer.py:906] 2025-10-22 16:11:54,922 >> The model is already on multiple devices. Skipping the move to device specified in `args`.
+[INFO|trainer.py:699] 2025-10-22 16:11:54,925 >> max_steps is given, it will override any value given in num_train_epochs
+[INFO|trainer.py:749] 2025-10-22 16:11:54,925 >> Using auto half precision backend
+[WARNING|2025-10-22 16:11:54] llamafactory.train.callbacks:154 >> Previous trainer log in this folder will be deleted.
+[WARNING|trainer.py:982] 2025-10-22 16:11:54,927 >> The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
+[INFO|trainer.py:2519] 2025-10-22 16:11:55,091 >> ***** Running training *****
+[INFO|trainer.py:2520] 2025-10-22 16:11:55,091 >> Num examples = 48,600
+[INFO|trainer.py:2521] 2025-10-22 16:11:55,091 >> Num Epochs = 1
+[INFO|trainer.py:2522] 2025-10-22 16:11:55,091 >> Instantaneous batch size per device = 1
+[INFO|trainer.py:2525] 2025-10-22 16:11:55,091 >> Total train batch size (w. parallel, distributed & accumulation) = 4
+[INFO|trainer.py:2526] 2025-10-22 16:11:55,091 >> Gradient Accumulation steps = 1
+[INFO|trainer.py:2527] 2025-10-22 16:11:55,091 >> Total optimization steps = 100
+[INFO|trainer.py:2528] 2025-10-22 16:11:55,092 >> Number of trainable parameters = 4,399,104
+[INFO|integration_utils.py:867] 2025-10-22 16:11:55,115 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
+wandb: Currently logged in as: zsprague (ut_nlp_deduce) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: Tracking run with wandb version 0.22.2
+wandb: Run data is saved locally in /scratch/zrs2020/LlamaFactoryHelper/wandb/run-20251022_161155-mev7yv4q
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run interactive_test
+wandb: View project at https://wandb.ai/ut_nlp_deduce/llamafactory
+wandb: View run at https://wandb.ai/ut_nlp_deduce/llamafactory/runs/mev7yv4q
+ 0%| | 0/100 [00:00, ?it/s] 1%| | 1/100 [00:00<01:20, 1.22it/s] 2%| | 2/100 [00:00<00:42, 2.29it/s] 3%| | 3/100 [00:01<00:30, 3.23it/s] 4%| | 4/100 [00:01<00:26, 3.62it/s] 5%| | 5/100 [00:01<00:26, 3.62it/s] 6%| | 6/100 [00:01<00:25, 3.75it/s] 7%| | 7/100 [00:02<00:24, 3.81it/s] 8%| | 8/100 [00:02<00:22, 4.10it/s] 9%| | 9/100 [00:02<00:21, 4.26it/s] 10%| | 10/100 [00:02<00:19, 4.60it/s] {'loss': 0.8095, 'grad_norm': 0.4082973599433899, 'learning_rate': 4.55e-05, 'epoch': 0.0}
+ 10%| | 10/100 [00:02<00:19, 4.60it/s] 11%| | 11/100 [00:02<00:19, 4.58it/s] 12%| | 12/100 [00:03<00:26, 3.37it/s] 13%| | 13/100 [00:03<00:22, 3.95it/s] 14%| | 14/100 [00:03<00:19, 4.49it/s] 15%| | 15/100 [00:03<00:17, 4.83it/s] 16%| | 16/100 [00:04<00:16, 5.02it/s] 17%| | 17/100 [00:04<00:18, 4.54it/s] 18%| | 18/100 [00:04<00:19, 4.31it/s] 19%| | 19/100 [00:04<00:19, 4.09it/s] 20%| | 20/100 [00:05<00:18, 4.30it/s] {'loss': 0.7526, 'grad_norm': 0.3977344334125519, 'learning_rate': 4.05e-05, 'epoch': 0.0}
+ 20%| | 20/100 [00:05<00:18, 4.30it/s] 21%| | 21/100 [00:05<00:19, 4.15it/s] 22%| | 22/100 [00:05<00:17, 4.34it/s] 23%| | 23/100 [00:05<00:18, 4.17it/s] 24%| | 24/100 [00:06<00:16, 4.48it/s] 25%| | 25/100 [00:06<00:18, 4.07it/s] 26%| | 26/100 [00:06<00:17, 4.20it/s] 27%| | 27/100 [00:06<00:17, 4.12it/s] 28%| | 28/100 [00:07<00:19, 3.67it/s] 29%| | 29/100 [00:07<00:18, 3.77it/s] 30%| | 30/100 [00:07<00:18, 3.87it/s] {'loss': 0.7383, 'grad_norm': 0.4655061662197113, 'learning_rate': 3.55e-05, 'epoch': 0.0}
+ 30%| | 30/100 [00:07<00:18, 3.87it/s] 31%| | 31/100 [00:07<00:17, 3.84it/s] 32%| | 32/100 [00:08<00:16, 4.06it/s] 33%| | 33/100 [00:08<00:15, 4.45it/s] 34%| | 34/100 [00:08<00:13, 4.89it/s] 35%| | 35/100 [00:08<00:14, 4.58it/s] 36%| | 36/100 [00:08<00:12, 5.03it/s] 37%| | 37/100 [00:09<00:13, 4.70it/s] 38%| | 38/100 [00:09<00:13, 4.66it/s] 39%| | 39/100 [00:09<00:12, 4.98it/s] 40%| | 40/100 [00:09<00:13, 4.59it/s] {'loss': 0.7139, 'grad_norm': 0.37491023540496826, 'learning_rate': 3.05e-05, 'epoch': 0.0}
+ 40%| | 40/100 [00:09<00:13, 4.59it/s] 41%| | 41/100 [00:10<00:14, 4.10it/s] 42%| | 42/100 [00:10<00:14, 4.08it/s] 43%| | 43/100 [00:10<00:12, 4.58it/s] 44%| | 44/100 [00:10<00:11, 4.90it/s] 45%| | 45/100 [00:10<00:10, 5.33it/s] 46%| | 46/100 [00:10<00:10, 4.98it/s] 47%| | 47/100 [00:11<00:10, 4.89it/s] 48%| | 48/100 [00:11<00:10, 5.02it/s] 49%| | 49/100 [00:11<00:11, 4.44it/s] 50%| | 50/100 [00:11<00:11, 4.45it/s] {'loss': 0.6497, 'grad_norm': 0.5901727676391602, 'learning_rate': 2.5500000000000003e-05, 'epoch': 0.0}
+ 50%| | 50/100 [00:11<00:11, 4.45it/s][INFO|trainer.py:4309] 2025-10-22 16:12:08,223 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50
+[INFO|configuration_utils.py:765] 2025-10-22 16:12:08,395 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:12:08,396 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:08,565 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:08,585 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:08,589 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-50/special_tokens_map.json
+ 51%| | 51/100 [00:13<00:24, 2.02it/s] 52%| | 52/100 [00:13<00:20, 2.31it/s] 53%| | 53/100 [00:13<00:17, 2.67it/s] 54%| | 54/100 [00:13<00:14, 3.24it/s] 55%| | 55/100 [00:13<00:13, 3.34it/s] 56%| | 56/100 [00:14<00:11, 3.95it/s] 57%| | 57/100 [00:14<00:10, 4.02it/s] 58%| | 58/100 [00:14<00:09, 4.44it/s] 59%| | 59/100 [00:14<00:07, 5.13it/s] 60%| | 60/100 [00:14<00:07, 5.54it/s] {'loss': 0.6288, 'grad_norm': 0.4912378787994385, 'learning_rate': 2.05e-05, 'epoch': 0.0}
+ 60%| | 60/100 [00:14<00:07, 5.54it/s] 61%| | 61/100 [00:15<00:07, 5.17it/s] 62%| | 62/100 [00:15<00:06, 5.72it/s] 63%| | 63/100 [00:15<00:07, 5.16it/s] 64%| | 64/100 [00:15<00:06, 5.46it/s] 65%| | 65/100 [00:15<00:07, 4.98it/s] 66%| | 66/100 [00:16<00:07, 4.60it/s] 67%| | 67/100 [00:16<00:06, 4.87it/s] 68%| | 68/100 [00:16<00:07, 4.54it/s] 69%| | 69/100 [00:16<00:07, 4.16it/s] 70%| | 70/100 [00:17<00:07, 4.10it/s] {'loss': 0.6135, 'grad_norm': 0.521141767501831, 'learning_rate': 1.55e-05, 'epoch': 0.01}
+ 70%| | 70/100 [00:17<00:07, 4.10it/s] 71%| | 71/100 [00:17<00:07, 3.97it/s] 72%| | 72/100 [00:17<00:06, 4.49it/s] 73%| | 73/100 [00:17<00:06, 4.00it/s] 74%| | 74/100 [00:17<00:06, 4.20it/s] 75%| | 75/100 [00:18<00:05, 4.70it/s] 76%| | 76/100 [00:18<00:05, 4.73it/s] 77%| | 77/100 [00:18<00:04, 5.24it/s] 78%| | 78/100 [00:18<00:04, 4.69it/s] 79%| | 79/100 [00:18<00:04, 4.55it/s] 80%| | 80/100 [00:19<00:04, 4.27it/s] {'loss': 0.6435, 'grad_norm': 0.4013785123825073, 'learning_rate': 1.05e-05, 'epoch': 0.01}
+ 80%| | 80/100 [00:19<00:04, 4.27it/s] 81%| | 81/100 [00:19<00:04, 4.66it/s] 82%| | 82/100 [00:19<00:04, 4.06it/s] 83%| | 83/100 [00:19<00:03, 4.46it/s] 84%| | 84/100 [00:20<00:03, 4.40it/s] 85%| | 85/100 [00:20<00:03, 4.46it/s] 86%| | 86/100 [00:20<00:02, 5.06it/s] 87%| | 87/100 [00:20<00:02, 5.19it/s] 88%| | 88/100 [00:20<00:02, 4.88it/s] 89%| | 89/100 [00:21<00:02, 4.59it/s] 90%| | 90/100 [00:21<00:01, 5.20it/s] {'loss': 0.6314, 'grad_norm': 0.544479489326477, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.01}
+ 90%| | 90/100 [00:21<00:01, 5.20it/s] 91%| | 91/100 [00:21<00:01, 4.64it/s] 92%|| 92/100 [00:21<00:01, 4.52it/s] 93%|| 93/100 [00:21<00:01, 4.74it/s] 94%|| 94/100 [00:22<00:01, 4.69it/s] 95%|| 95/100 [00:22<00:01, 4.78it/s] 96%|| 96/100 [00:22<00:00, 4.42it/s] 97%|| 97/100 [00:23<00:00, 3.84it/s] 98%|| 98/100 [00:23<00:00, 4.26it/s] 99%|| 99/100 [00:23<00:00, 4.53it/s]100%|| 100/100 [00:23<00:00, 4.31it/s] {'loss': 0.6241, 'grad_norm': 0.4398234486579895, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.01}
+100%|| 100/100 [00:23<00:00, 4.31it/s][INFO|trainer.py:4309] 2025-10-22 16:12:19,957 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+[INFO|configuration_utils.py:765] 2025-10-22 16:12:20,123 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:12:20,124 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:20,301 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:20,337 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:20,361 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100/special_tokens_map.json
+[INFO|trainer.py:2810] 2025-10-22 16:12:20,905 >>
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+ {'train_runtime': 25.8131, 'train_samples_per_second': 15.496, 'train_steps_per_second': 3.874, 'train_loss': 0.6805157041549683, 'epoch': 0.01}
+100%|| 100/100 [00:24<00:00, 4.31it/s]100%|| 100/100 [00:24<00:00, 4.07it/s]
+[INFO|trainer.py:4309] 2025-10-22 16:12:20,914 >> Saving model checkpoint to /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+[INFO|configuration_utils.py:765] 2025-10-22 16:12:20,991 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:12:20,992 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:21,136 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:21,141 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:21,146 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/special_tokens_map.json
+***** train metrics *****
+ epoch = 0.0082
+ total_flos = 1473847GF
+ train_loss = 0.6805
+ train_runtime = 0:00:25.81
+ train_samples_per_second = 15.496
+ train_steps_per_second = 3.874
+[INFO|modelcard.py:456] 2025-10-22 16:12:21,372 >> Dropping the following result as it does not have all the necessary fields:
+{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
+gl064:2373550:2373550 [1] NCCL INFO comm 0x123b0010 rank 1 nranks 4 cudaDev 1 busId 59000 - Destroy COMPLETE
+gl064:2373549:2373549 [0] NCCL INFO comm 0x156177e0 rank 0 nranks 4 cudaDev 0 busId 47000 - Destroy COMPLETE
+wandb:
+wandb: View run interactive_test at: 
+wandb: Find logs at: wandb/run-20251022_161155-mev7yv4q/logs
+
+========================================
+Training completed successfully
+End Time: Wed Oct 22 04:12:23 PM EDT 2025
+========================================
+
+========================================
+STAGE 2: Merging/Exporting Model
+Start Time: Wed Oct 22 04:12:23 PM EDT 2025
+========================================
+Looking for checkpoints in: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints
+Analyzing checkpoints to find the one from current training run...
+ - checkpoint-100: trainer_state.json modified at Wed Oct 22 04:12:20 PM EDT 2025
+ - checkpoint-150: trainer_state.json modified at Wed Oct 22 04:02:30 PM EDT 2025
+ - checkpoint-50: trainer_state.json modified at Wed Oct 22 04:12:09 PM EDT 2025
+
+Selected checkpoint: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+This checkpoint has the most recently updated trainer_state.json
+Checkpoint details:
+ Path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+ Last modified: 2025-10-22 16:02:17.627741631 -0400
+ Training step: 100
+Updating merge config to point to checkpoint...
+Successfully updated merge config
+Updated merge config to use: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+
+Merge config contents:
+ model_name_or_path: Qwen/Qwen2.5-0.5B
+ finetuning_type: lora
+ trust_remote_code: true
+ adapter_name_or_path: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+ template: default
+ export_dir: /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged
+
+Executing command: llamafactory-cli export /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/configs/merge_config.yaml
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/transformers/utils/hub.py:110: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+ warnings.warn(
+/scratch/zrs2020/miniconda/miniconda3/envs/llamafactory/lib/python3.12/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+ import pkg_resources
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:32,643 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:12:32,816 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:12:33,035 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:12:33,036 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file vocab.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/vocab.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file merges.txt from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/merges.txt
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file tokenizer.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file added_tokens.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file special_tokens_map.json from cache at None
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file tokenizer_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2095] 2025-10-22 16:12:33,123 >> loading file chat_template.jinja from cache at None
+[INFO|tokenization_utils_base.py:2364] 2025-10-22 16:12:33,289 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+[INFO|configuration_utils.py:765] 2025-10-22 16:12:33,337 >> loading configuration file config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
+[INFO|configuration_utils.py:839] 2025-10-22 16:12:33,338 >> Model config Qwen2Config {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "dtype": "bfloat16",
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "initializer_range": 0.02,
+ "intermediate_size": 4864,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 24,
+ "model_type": "qwen2",
+ "num_attention_heads": 14,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_mrope": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+}
+
+[WARNING|logging.py:328] 2025-10-22 16:12:33,338 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|2025-10-22 16:12:33] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
+[WARNING|logging.py:328] 2025-10-22 16:12:33,651 >> `torch_dtype` is deprecated! Use `dtype` instead!
+[INFO|modeling_utils.py:1172] 2025-10-22 16:12:33,651 >> loading weights file model.safetensors from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/model.safetensors
+[INFO|modeling_utils.py:2341] 2025-10-22 16:12:33,652 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
+[INFO|configuration_utils.py:986] 2025-10-22 16:12:33,653 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": 151643
+}
+
+[INFO|configuration_utils.py:941] 2025-10-22 16:12:33,738 >> loading configuration file generation_config.json from cache at /scratch/zrs2020/.cache/hf_cache/home/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/generation_config.json
+[INFO|configuration_utils.py:986] 2025-10-22 16:12:33,739 >> Generate config GenerationConfig {
+ "bos_token_id": 151643,
+ "eos_token_id": 151643,
+ "max_new_tokens": 2048
+}
+
+[INFO|dynamic_module_utils.py:423] 2025-10-22 16:12:33,767 >> Could not locate the custom_generate/generate.py inside Qwen/Qwen2.5-0.5B.
+[INFO|2025-10-22 16:12:33] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
+[INFO|2025-10-22 16:12:34] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
+[INFO|2025-10-22 16:12:34] llamafactory.model.adapter:143 >> Loaded adapter(s): /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/checkpoints/checkpoint-100
+[INFO|2025-10-22 16:12:34] llamafactory.model.loader:143 >> all params: 494,032,768
+[INFO|2025-10-22 16:12:34] llamafactory.train.tuner:143 >> Convert model dtype to: torch.bfloat16.
+[INFO|configuration_utils.py:491] 2025-10-22 16:12:34,577 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/config.json
+[INFO|configuration_utils.py:757] 2025-10-22 16:12:34,582 >> Configuration saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/generation_config.json
+[INFO|modeling_utils.py:4181] 2025-10-22 16:12:36,078 >> Model weights saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/model.safetensors
+[INFO|tokenization_utils_base.py:2421] 2025-10-22 16:12:36,082 >> chat template saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/chat_template.jinja
+[INFO|tokenization_utils_base.py:2590] 2025-10-22 16:12:36,087 >> tokenizer config file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2599] 2025-10-22 16:12:36,092 >> Special tokens file saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/special_tokens_map.json
+[INFO|2025-10-22 16:12:36] llamafactory.train.tuner:143 >> Ollama modelfile saved in /scratch/zrs2020/LlamaFactoryHelper/experiments/lf_torch_test__interactive/merged/Modelfile
+
+========================================
+Merge/Export completed successfully
+End Time: Wed Oct 22 04:12:37 PM EDT 2025
+========================================
+
+========================================
+Preparing Training Artifacts
+========================================
+Copying configuration files...
+Copying and cleaning training logs...