#!/bin/bash
#
# Distributed sampling launcher: runs sample_sd3_rectified_ddp.py through
# torchrun with an optional LoRA checkpoint and optional Rectified (SIT)
# weights. A 4-process launch is attempted first; if it fails, the script
# retries with a single process and exits with that run's real exit code
# if it fails as well.
#
# Typical background invocation:
#   nohup bash run_sd3_rectified_sampling.sh > run_sd3_rectified_sampling.log 2>&1 &

set -euo pipefail

# GPUs visible to the launched processes, plus NCCL debug/transport settings.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
export NCCL_IB_DISABLE=1
export NCCL_P2P_LEVEL=SYS

# Base model snapshot.
# Alternative location kept for reference:
#   "/gemini/space/zhaozy/zhy/hsd/project/pretrained_model/models--stabilityai--stable-diffusion-3-medium-diffusers"
PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"

# May be empty; the --lora_path flag is then omitted.
LORA_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"

# May be empty (when Rectified is not used); --rectified_weights is then omitted.
RECTIFIED_WEIGHTS="/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-120000"

CAPTIONS_JSONL="/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl"
SAMPLE_DIR="./sd3_rectified_samples_new_batch_2"

NUM_INFERENCE_STEPS=40
GUIDANCE_SCALE=7.0
HEIGHT=512
WIDTH=512
PER_PROC_BATCH_SIZE=32
IMAGES_PER_CAPTION=3
MAX_SAMPLES=30000
GLOBAL_SEED=42
MIXED_PRECISION="fp16"  # no / fp16 / bf16
NUM_SIT_LAYERS=1        # must match the value used for training

# torchrun launch settings shared by both attempts.
SCRIPT="sample_sd3_rectified_ddp.py"
NUM_GPUS=4
MASTER_PORT=25913

# Arguments are collected in an array so that every value is passed as
# exactly one word, regardless of its content.
ARGS=(
  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
  --captions_jsonl "$CAPTIONS_JSONL"
  --sample_dir "$SAMPLE_DIR"
  --num_inference_steps "$NUM_INFERENCE_STEPS"
  --guidance_scale "$GUIDANCE_SCALE"
  --height "$HEIGHT"
  --width "$WIDTH"
  --per_proc_batch_size "$PER_PROC_BATCH_SIZE"
  --images_per_caption "$IMAGES_PER_CAPTION"
  --max_samples "$MAX_SAMPLES"
  --global_seed "$GLOBAL_SEED"
  --num_sit_layers "$NUM_SIT_LAYERS"
  --mixed_precision "$MIXED_PRECISION"
)

# Optional checkpoints are only forwarded when a path is configured.
if [[ -n "$LORA_PATH" ]]; then
  ARGS+=(--lora_path "$LORA_PATH")
fi

if [[ -n "$RECTIFIED_WEIGHTS" ]]; then
  ARGS+=(--rectified_weights "$RECTIFIED_WEIGHTS")
fi

echo "[run_sd3_rectified_sampling.sh] start torchrun: $(date)"

# Try the multi-GPU launch first and fall back to a single process on failure.
# The status is captured with `|| ret=$?` rather than inside `if ! cmd`,
# because `$?` after a negated condition is the status of the negation
# (always 0 in the `then` branch), not the command's own exit code.
# The `||` also keeps `set -e` from aborting before the fallback runs.
ret=0
torchrun --nproc_per_node="$NUM_GPUS" --master_port="$MASTER_PORT" \
  "$SCRIPT" "${ARGS[@]}" || ret=$?

if (( ret != 0 )); then
  echo "[run_sd3_rectified_sampling.sh] 4卡运行失败(退出码 ${ret}),尝试单卡模式"

  ret2=0
  torchrun --nproc_per_node=1 --master_port="$MASTER_PORT" \
    "$SCRIPT" "${ARGS[@]}" || ret2=$?

  if (( ret2 != 0 )); then
    echo "[run_sd3_rectified_sampling.sh] 单卡运行也失败(退出码 ${ret2}),请查看具体错误信息。"
    # Propagate the real failure code so callers and schedulers see the error.
    exit "$ret2"
  fi

  echo "[run_sd3_rectified_sampling.sh] 单卡运行成功,建议降低 per_proc_batch_size 或使用单卡配置继续。"
fi

echo "Sampling done. Output at: $SAMPLE_DIR"