xiangzai
/

sida

Model card Files Files and versions

sida / run_sd3_rectified_sampling_old.sh

xiangzai's picture

Add files using upload-large-folder tool

7803bdf verified 6 days ago

history blame contribute delete

2.62 kB

	#!/bin/bash
	set -e -u -o pipefail

	# Configuration for distributed sampling: selects the base model, the LoRA
	# checkpoint and the Rectified (SIT) weights, plus the sampling settings.

	# GPU selection and NCCL settings; exported so child processes inherit them.
	CUDA_VISIBLE_DEVICES='0,1,2,3'
	NCCL_DEBUG='INFO'
	NCCL_DEBUG_SUBSYS='ALL'
	NCCL_IB_DISABLE='1'
	NCCL_P2P_LEVEL='SYS'
	export CUDA_VISIBLE_DEVICES NCCL_DEBUG NCCL_DEBUG_SUBSYS NCCL_IB_DISABLE NCCL_P2P_LEVEL

	# Base model snapshot directory.
	# Alternative location (kept commented out):
	#   "/gemini/space/zhaozy/zhy/hsd/project/pretrained_model/models--stabilityai--stable-diffusion-3-medium-diffusers"
	PRETRAINED_MODEL='/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671'

	# Optional checkpoints.
	# LORA_PATH may be left empty.
	LORA_PATH='/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000'
	# RECTIFIED_WEIGHTS may be left empty (when Rectified is not used).
	RECTIFIED_WEIGHTS='/gemini/space/gzy_new/models/Sida/rectified-noise-batch-2/checkpoint-120000'

	# Caption source and output directory.
	CAPTIONS_JSONL='/gemini/space/hsd/project/dataset/cc3m-wds/validation/metadata.jsonl'
	SAMPLE_DIR='./sd3_rectified_samples_new_batch_2'

	# Sampling settings.
	NUM_INFERENCE_STEPS='40'
	GUIDANCE_SCALE='7.0'
	HEIGHT='512'
	WIDTH='512'
	PER_PROC_BATCH_SIZE='32'
	IMAGES_PER_CAPTION='3'
	MAX_SAMPLES='30000'
	GLOBAL_SEED='42'
	# Accepted values: no / fp16 / bf16
	MIXED_PRECISION='fp16'
	# Must match the value used during training.
	NUM_SIT_LAYERS='1'

	# Build the argument vector handed to sample_sd3_rectified_ddp.py.
	# Every expansion is quoted so that each flag always keeps exactly one value
	# slot: an empty setting or one containing whitespace/glob characters can no
	# longer be dropped or split and shift the following flags out of alignment.
	ARGS=(
	  --pretrained_model_name_or_path "$PRETRAINED_MODEL"
	  --captions_jsonl "$CAPTIONS_JSONL"
	  --sample_dir "$SAMPLE_DIR"
	  --num_inference_steps "$NUM_INFERENCE_STEPS"
	  --guidance_scale "$GUIDANCE_SCALE"
	  --height "$HEIGHT"
	  --width "$WIDTH"
	  --per_proc_batch_size "$PER_PROC_BATCH_SIZE"
	  --images_per_caption "$IMAGES_PER_CAPTION"
	  --max_samples "$MAX_SAMPLES"
	  --global_seed "$GLOBAL_SEED"
	  --num_sit_layers "$NUM_SIT_LAYERS"
	  --mixed_precision "$MIXED_PRECISION"
	)

	# Optional inputs: the flag is added only when a non-empty path is configured.
	# ${VAR:-} keeps the check from aborting under `set -u` when the variable is
	# removed entirely instead of being set to an empty string.
	if [[ -n "${LORA_PATH:-}" ]]; then
	  ARGS+=(--lora_path "$LORA_PATH")
	fi

	if [[ -n "${RECTIFIED_WEIGHTS:-}" ]]; then
	  ARGS+=(--rectified_weights "$RECTIFIED_WEIGHTS")
	fi

	echo "[run_sd3_rectified_sampling.sh] start torchrun: $(date)"

	# Try the 4-process launch first and fall back to a single process on failure.
	#
	# The exit status is captured with `cmd || ret=$?`. Inside `if ! cmd; then`
	# the value of `$?` is the status of the negated pipeline (always 0), so it
	# cannot be used to report or propagate cmd's own failure code. The `||` form
	# also keeps `set -e` from aborting the script before the fallback runs.
	ret=0
	torchrun --nproc_per_node=4 --master_port=25913 sample_sd3_rectified_ddp.py "${ARGS[@]}" || ret=$?
	if [ "$ret" -ne 0 ]; then
	  echo "[run_sd3_rectified_sampling.sh] 4卡运行失败(退出码 ${ret})，尝试单卡模式"
	  ret2=0
	  torchrun --nproc_per_node=1 --master_port=25913 sample_sd3_rectified_ddp.py "${ARGS[@]}" || ret2=$?
	  if [ "$ret2" -ne 0 ]; then
	    echo "[run_sd3_rectified_sampling.sh] 单卡运行也失败(退出码 ${ret2})，请查看具体错误信息。"
	    # Propagate the real failure code so callers and logs see a non-zero exit.
	    exit "$ret2"
	  fi
	  echo "[run_sd3_rectified_sampling.sh] 单卡运行成功，建议降低 per_proc_batch_size 或使用单卡配置继续。"
	fi

	echo "Sampling done. Output at: $SAMPLE_DIR"
	# Example background invocation:
	# nohup bash run_sd3_rectified_sampling.sh > run_sd3_rectified_sampling.log 2>&1 &