diff --git "a/GVP/Baseline/compare_sampling.log" "b/GVP/Baseline/compare_sampling.log" new file mode 100644--- /dev/null +++ "b/GVP/Baseline/compare_sampling.log" @@ -0,0 +1,110 @@ +W0408 14:16:40.592000 6830 site-packages/torch/distributed/run.py:793] +W0408 14:16:40.592000 6830 site-packages/torch/distributed/run.py:793] ***************************************** +W0408 14:16:40.592000 6830 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0408 14:16:40.592000 6830 site-packages/torch/distributed/run.py:793] ***************************************** +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. + 0%| | 0/1042 [00:00 > >) + 0x282 (0x7fa985c2a772 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7fa985c31bb3 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7fa985c3361d in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0x145c0 (0x7fa9d01855c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so) +frame #5: + 0x94ac3 (0x7fa9d6e6cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6) +frame #6: clone + 0x44 (0x7fa9d6efdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6) + +terminate called after throwing an instance of 'c10::DistBackendError' +[E409 20:08:57.340247331 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1050, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
+Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f09f32b9446 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libc10.so)
+frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7f09a922a772 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7f09a9231bb3 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f09a923361d in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #4: <unknown function> + 0x145c0 (0x7f09f37795c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so)
+frame #5: <unknown function> + 0x94ac3 (0x7f09fa46cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+frame #6: clone + 0x44 (0x7f09fa4fdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+
+ what(): [PG ID 0 PG GUID 0(default_pg) Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1050, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
+Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7fa9cfd6c446 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libc10.so)
+frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7fa985c2a772 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7fa985c31bb3 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7fa985c3361d in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #4: <unknown function> + 0x145c0 (0x7fa9d01855c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so)
+frame #5: <unknown function> + 0x94ac3 (0x7fa9d6e6cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+frame #6: clone + 0x44 (0x7fa9d6efdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+
+Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1601 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7fa9cfd6c446 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0xe4271b (0x7fa9858a071b in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #2: <unknown function> + 0x145c0 (0x7fa9d01855c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so)
+frame #3: <unknown function> + 0x94ac3 (0x7fa9d6e6cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+frame #4: clone + 0x44 (0x7fa9d6efdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+terminate called after throwing an instance of 'c10::DistBackendError'
+ what(): [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1050, OpType=ALLREDUCE, NumelIn=1, NumelOut=1, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
+Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f09f32b9446 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libc10.so)
+frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7f09a922a772 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7f09a9231bb3 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7f09a923361d in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #4: <unknown function> + 0x145c0 (0x7f09f37795c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so)
+frame #5: <unknown function> + 0x94ac3 (0x7f09fa46cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+frame #6: clone + 0x44 (0x7f09fa4fdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+
+Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1601 (most recent call first):
+frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f09f32b9446 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libc10.so)
+frame #1: <unknown function> + 0xe4271b (0x7f09a8ea071b in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
+frame #2: <unknown function> + 0x145c0 (0x7f09f37795c0 in /root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/lib/libtorch.so)
+frame #3: <unknown function> + 0x94ac3 (0x7f09fa46cac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+frame #4: clone + 0x44 (0x7f09fa4fdbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
+
+ Building .npz from base:  21%|██        | 10395/50000 [10:00<36:13, 18.22it/s] ... Building .npz from base:  21%|██        | 10492/50000 [10:05<24:01, 27.41it/s]
+W0409 20:09:08.031000 6830 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 6922 closing signal SIGTERM
+W0409 20:09:08.035000 6830 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 6923 closing signal SIGTERM
+W0409 20:09:08.036000 6830 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 6924 closing signal SIGTERM
+E0409 20:09:08.752000 6830 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 3 (pid: 6925) of binary: /root/miniconda3/envs/SiT/bin/python3.10
+Traceback (most recent call last):
+  File "/root/miniconda3/envs/SiT/bin/torchrun", line 6, in <module>
+    sys.exit(main())
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+    return f(*args, **kwargs)
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
+    run(args)
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
+    elastic_launch(
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+==========================================================
+sample_compare_ddp_rectified.py FAILED
+----------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+----------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-04-09_20:09:08
+  host      : 280c8972fe62c4ab251b3c74bd05a546-taskrole1-0
+  rank      : 3 (local_rank: 3)
+  exitcode  : -6 (pid: 6925)
+  error_file: <N/A>
+  traceback : Signal 6 (SIGABRT) received by PID 6925
+==========================================================
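Note (not part of the captured log): the failure above is PyTorch's stock NCCL watchdog abort. Rank 3's ALLREDUCE (SeqNum=1050) exceeded the default 600000 ms collective timeout while the ranks were still building the .npz, so the watchdog threw c10::DistBackendError and torchrun tore the job down with SIGABRT. A minimal sketch of one way to give slow collectives more headroom, assuming sample_compare_ddp_rectified.py initializes its own process group (the script's actual setup code is not shown in this diff):

    # Hypothetical init snippet -- the real script's setup is not part of this diff.
    from datetime import timedelta

    import torch.distributed as dist

    # NCCL collectives default to a 10-minute (600000 ms) watchdog timeout,
    # which is exactly what expires in the log. Passing an explicit timeout
    # raises that ceiling for every collective on this process group.
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=60))

A longer timeout only hides the symptom if one rank is genuinely stuck; here the progress bar shows rank-skewed throughput, so either raising the timeout or rebalancing the per-rank sample counts would keep the all-reduce from stalling.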