diff --git "a/run_sd3_lora_sampling.log" "b/run_sd3_lora_sampling.log" new file mode 100644--- /dev/null +++ "b/run_sd3_lora_sampling.log" @@ -0,0 +1,202 @@ +nohup: ignoring input +开始SD3 LoRA采样(从checkpoint加载)... +模型: /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671 +LoRA Checkpoint路径: /gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000 +LoRA Rank: 32 +Caption文件: /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +每个caption生成图片数: 3 +图像尺寸: 512x512 +引导尺度: 7.0 +推理步数: 40 +[run_sd3_lora_sampling.sh] 开始 torchrun: Fri Mar 20 10:39:17 CST 2026 +W0320 10:39:21.131000 17320 site-packages/torch/distributed/run.py:793] +W0320 10:39:21.131000 17320 site-packages/torch/distributed/run.py:793] ***************************************** +W0320 10:39:21.131000 17320 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0320 10:39:21.131000 17320 site-packages/torch/distributed/run.py:793] ***************************************** +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[W320 10:39:37.641147474 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator()) +[W320 10:39:38.716913023 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator()) +Starting rank=2, device=cuda:2, seed=170, world_size=4, visible_devices=4. +Starting rank=3, device=cuda:3, seed=171, world_size=4, visible_devices=4. +Loaded 13443 captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +Loaded 13443 captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +[W320 10:39:38.588759066 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator()) +[W320 10:39:39.838649997 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator()) +Starting rank=0, device=cuda:0, seed=168, world_size=4, visible_devices=4. +Loading captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +Starting rank=1, device=cuda:1, seed=169, world_size=4, visible_devices=4. +Loaded 13443 captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +Will generate 3 images for each of 13443 captions +Total images requested: 40329 +Max samples limit: 30000 +Total images to generate: 30000 +Loading SD3 pipeline from /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671 +Loaded 13443 captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl + Loading pipeline components...: 0%| | 0/9 [00:00 + sys.exit(main()) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper + return f(*args, **kwargs) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main + run(args) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run + elastic_launch( + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +========================================================== +sample_sd3_lora_checkpoint_ddp.py FAILED +---------------------------------------------------------- +Failures: +[1]: + time : 2026-03-20_11:03:42 + host : 6f8cc9898f522c47caca7fc0e85492cc-taskrole1-0 + rank : 3 (local_rank: 3) + exitcode : -6 (pid: 17418) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 17418 +---------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2026-03-20_11:03:42 + host : 6f8cc9898f522c47caca7fc0e85492cc-taskrole1-0 + rank : 2 (local_rank: 2) + exitcode : -6 (pid: 17417) + error_file: + traceback : Signal 6 (SIGABRT) received by PID 17417 +========================================================== +[run_sd3_lora_sampling.sh] torchrun异常退出,exit code=0 +尝试降级到单GPU模式以确认问题 +[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds. +[W320 11:04:00.997951053 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator()) +Starting rank=0, device=cuda:0, seed=42, world_size=1, visible_devices=4. +Loading captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +Loaded 13443 captions from /gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl +Will generate 3 images for each of 13443 captions +Total images requested: 40329 +Max samples limit: 30000 +Total images to generate: 30000 +Loading SD3 pipeline from /gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671 + Loading pipeline components...: 0%| | 0/9 [00:00