#!/bin/bash
#
# Launch SD3 rectified sampling with torchrun.
#
# Runs sample_sd3_rectified_ddp.py (resolved relative to the current working
# directory) with 4 processes. If that launch exits non-zero, it is retried
# once with a single process. When both attempts fail, the script exits with
# the exit status of the single-process attempt.
set -euo pipefail

# --- GPU / NCCL environment --------------------------------------------------
export CUDA_VISIBLE_DEVICES="0,1,2,3"
# Verbose NCCL logging for every subsystem.
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
# NOTE(review): per the NCCL docs these disable the InfiniBand transport and
# set the peer-to-peer level to SYS — confirm they still match the target host.
export NCCL_IB_DISABLE=1
export NCCL_P2P_LEVEL=SYS

# --- Model locations ---------------------------------------------------------
PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"

# Both are optional: leave a value empty to omit the corresponding flag.
LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-120000"

# --- Input captions and output directory -------------------------------------
CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"
SAMPLE_DIR="./sd3_rectified_samples_new_batch_2"

# --- Sampling parameters -----------------------------------------------------
NUM_INFERENCE_STEPS=40
GUIDANCE_SCALE=7.0
HEIGHT=512
WIDTH=512
PER_PROC_BATCH_SIZE=32
IMAGES_PER_CAPTION=3
MAX_SAMPLES=30000
GLOBAL_SEED=42
MIXED_PRECISION="fp16"
NUM_SIT_LAYERS=1

# --- Launcher constants ------------------------------------------------------
readonly SAMPLER_SCRIPT="sample_sd3_rectified_ddp.py"
readonly MASTER_PORT=25913

# Arguments are kept in an array so every value reaches the sampler as exactly
# one word, regardless of its contents.
ARGS=(
  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
  --captions_jsonl "$CAPTIONS_JSONL"
  --sample_dir "$SAMPLE_DIR"
  --num_inference_steps "$NUM_INFERENCE_STEPS"
  --guidance_scale "$GUIDANCE_SCALE"
  --height "$HEIGHT"
  --width "$WIDTH"
  --per_proc_batch_size "$PER_PROC_BATCH_SIZE"
  --images_per_caption "$IMAGES_PER_CAPTION"
  --max_samples "$MAX_SAMPLES"
  --global_seed "$GLOBAL_SEED"
  --num_sit_layers "$NUM_SIT_LAYERS"
  --mixed_precision "$MIXED_PRECISION"
)

# Optional flags are appended only when their value is non-empty.
if [[ -n "$LORA_PATH" ]]; then
  ARGS+=(--lora_path "$LORA_PATH")
fi

if [[ -n "$RECTIFIED_WEIGHTS" ]]; then
  ARGS+=(--rectified_weights "$RECTIFIED_WEIGHTS")
fi

echo "[run_sd3_rectified_sampling.sh] start torchrun: $(date)"

# The exit status is captured with `cmd || ret=$?`. Reading $? inside
# `if ! cmd; then ...` would always yield 0 because `!` inverts the status,
# which would both log the wrong code and make the script exit 0 on failure.
# The `||` form also keeps `set -e` from aborting before the fallback runs.
ret=0
torchrun --nproc_per_node=4 --master_port="$MASTER_PORT" \
  "$SAMPLER_SCRIPT" "${ARGS[@]}" || ret=$?

if (( ret != 0 )); then
  echo "[run_sd3_rectified_sampling.sh] 4卡运行失败(退出码 ${ret}),尝试单卡模式"

  # Fallback: retry once with a single process.
  ret2=0
  torchrun --nproc_per_node=1 --master_port="$MASTER_PORT" \
    "$SAMPLER_SCRIPT" "${ARGS[@]}" || ret2=$?

  if (( ret2 != 0 )); then
    echo "[run_sd3_rectified_sampling.sh] 单卡运行也失败(退出码 ${ret2}),请查看具体错误信息。"
    exit "$ret2"
  fi

  echo "[run_sd3_rectified_sampling.sh] 单卡运行成功,建议降低 per_proc_batch_size 或使用单卡配置继续。"
fi

# torchrun runs in the foreground, so there are no background jobs to wait on.
echo "Sampling done. Output at: $SAMPLE_DIR"