diff --git "a/GVP/Baseline/W_No.log" "b/GVP/Baseline/W_No.log" new file mode 100644--- /dev/null +++ "b/GVP/Baseline/W_No.log" @@ -0,0 +1,54 @@ +W0317 13:39:32.843000 17656 site-packages/torch/distributed/run.py:793] +W0317 13:39:32.843000 17656 site-packages/torch/distributed/run.py:793] ***************************************** +W0317 13:39:32.843000 17656 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0317 13:39:32.843000 17656 site-packages/torch/distributed/run.py:793] ***************************************** +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +Starting rank=0, seed=0, world_size=4. +Starting rank=2, seed=2, world_size=4. +Starting rank=1, seed=1, world_size=4. +Starting rank=3, seed=3, world_size=4. +Saving .png samples at GVP_samples/depth-mu-6-0300000-base-cfg-1.0-12-SDE-250-Euler-sigma-Mean-0.04 +Total number of images that will be sampled: 50016 + 0%| | 0/1042 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main + run(args) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run + elastic_launch( + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +========================================================== +sample_rectified_noise.py FAILED +---------------------------------------------------------- +Failures: + +---------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2026-03-17_22:36:25 + host : c2bc8fcbbd719a96f88570734800e3a3-taskrole1-0 + rank : 3 (local_rank: 3) + exitcode : -6 (pid: 17739) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 17739 +==========================================================