#!/bin/bash
#
# SD3 Rectified Noise training script.
#
# Shows how to launch train_rectified_noise.py through `accelerate launch`,
# using a LoRA-finetuned SD3 model as the starting point.
#
# Usage:
#   bash train_rectified_noise.sh
#   nohup bash train_rectified_noise.sh > train_rectified_noise.log 2>&1 &
#
# Requirements visible from this script:
#   - conda installed under /root/miniconda3 with an environment named "SiT"
#   - accelerate_config.yaml and train_rectified_noise.py in the current
#     working directory

# Abort on the first failing command; a pipeline fails if any stage fails.
set -eo pipefail

#######################################
# Print every argument as its own line on stderr, then exit with status 1.
# Arguments: $@ - message lines
# Outputs:   the message lines, on stderr
#######################################
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Activate the conda environment used for training.
# shellcheck source=/dev/null
source /root/miniconda3/etc/profile.d/conda.sh
conda activate SiT

# Enabled only after activation: conda's activation hooks may reference unset
# variables (NOTE(review): depends on the installed conda version/plugins).
set -u

# ---------------------------------------------------------------------------
# Basic configuration
# ---------------------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0,1,2,3 # use 4 GPUs (0,1,2,3)
#export OMP_NUM_THREADS=1

# Memory optimization settings.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# ---------------------------------------------------------------------------
# Model and data paths
# ---------------------------------------------------------------------------
readonly PRETRAINED_MODEL="/gemini/space/hsd/project/pretrained_model/huggingface/hub/models--stabilityai--stable-diffusion-3-medium-diffusers/snapshots/ea42f8cef0f178587cf766dc8129abd379c90671"
# Path of the LoRA-finetuned SD3 model.
readonly LORA_MODEL_PATH="/gemini/space/gzy_new/models/Sida/sd3-lora-finetuned-batch-4/checkpoint-500000"
# Training data directory.
readonly TRAIN_DATA_DIR="/gemini/space/hsd/project/dataset/cc3m-wds/train"
# Output directory.
readonly OUTPUT_DIR="./rectified-noise-batch-2"
# Accelerate configuration file, resolved relative to the working directory.
readonly ACCELERATE_CONFIG="accelerate_config.yaml"

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
readonly NUM_SIT_LAYERS=1          # number of SIT block layers
readonly SIT_LEARNING_RATE=1e-5    # learning rate of the SIT blocks
readonly KL_LOSS_WEIGHT=0.5        # KL-divergence loss weight
readonly RESOLUTION=512            # image resolution
readonly BATCH_SIZE=2              # batch size
readonly GRADIENT_ACCUMULATION=2   # gradient accumulation steps
readonly MAX_TRAIN_STEPS=500000    # maximum number of training steps

# ---------------------------------------------------------------------------
# Validation parameters
# ---------------------------------------------------------------------------
readonly VALIDATION_PROMPT="A bicycle replica with a clock as the front wheel."
readonly NUM_VALIDATION_IMAGES=1

echo "开始 SD3 Rectified Noise 训练..."
echo "LoRA模型路径: $LORA_MODEL_PATH"
echo "SIT层数: $NUM_SIT_LAYERS"
echo "输出目录: $OUTPUT_DIR"

# ---------------------------------------------------------------------------
# Pre-flight checks: fail fast, on stderr, before starting the launch.
# ---------------------------------------------------------------------------
# The LoRA model directory must exist.
if [[ ! -d "$LORA_MODEL_PATH" ]]; then
  die "错误: LoRA模型路径不存在: $LORA_MODEL_PATH" \
    "请先使用 train_lora_sd3.py 训练LoRA模型"
fi

# The remaining inputs are configured above as local paths, so a missing one
# is reported here instead of surfacing later from inside the launched job.
[[ -e "$PRETRAINED_MODEL" ]] \
  || die "错误: 预训练模型路径不存在: $PRETRAINED_MODEL"
[[ -e "$TRAIN_DATA_DIR" ]] \
  || die "错误: 训练数据目录不存在: $TRAIN_DATA_DIR"
[[ -f "$ACCELERATE_CONFIG" ]] \
  || die "错误: 找不到 accelerate 配置文件: $ACCELERATE_CONFIG"

# ---------------------------------------------------------------------------
# Launch training with accelerate.
# Note: the mixed_precision argument was removed from the command line because
# it is already set in accelerate_config.yaml.
# ---------------------------------------------------------------------------
# Arguments are collected in an array so each one is passed as exactly one
# word, regardless of spaces in the values (e.g. the validation prompt).
train_args=(
  # Model and data locations.
  --pretrained_model_name_or_path="$PRETRAINED_MODEL"
  --lora_model_path="$LORA_MODEL_PATH"
  --train_data_dir="$TRAIN_DATA_DIR"

  # SIT block and loss settings.
  --num_sit_layers="$NUM_SIT_LAYERS"
  --sit_learning_rate="$SIT_LEARNING_RATE"
  --kl_loss_weight="$KL_LOSS_WEIGHT"

  # Input size and batching.
  --resolution="$RESOLUTION"
  --train_batch_size="$BATCH_SIZE"
  --gradient_accumulation_steps="$GRADIENT_ACCUMULATION"
  --gradient_checkpointing

  # Optimizer schedule.
  # NOTE(review): --learning_rate and --sit_learning_rate are both 1e-5; how
  # the trainer uses each of them is not visible from this script.
  --learning_rate=1e-5
  --time_weight_alpha=5.0
  --lr_scheduler="constant"
  --lr_warmup_steps=0
  --max_train_steps="$MAX_TRAIN_STEPS"

  # Output location.
  --output_dir="$OUTPUT_DIR"

  # Validation.
  --validation_prompt="$VALIDATION_PROMPT"
  --num_validation_images="$NUM_VALIDATION_IMAGES"
  --validation_steps=20000

  # Seed and data loading.
  --seed=42
  --dataloader_num_workers=8

  # Checkpointing.
  --save_sit_weights_only
  --checkpointing_steps=20000
  --checkpoints_total_limit=10

  # Logging.
  --report_to="tensorboard"
  --logging_dir="./logs"
)

accelerate launch --config_file "$ACCELERATE_CONFIG" \
  train_rectified_noise.py "${train_args[@]}"

echo "训练完成!"
echo "SIT权重保存在: $OUTPUT_DIR/sit_weights/"
echo "验证图像保存在: $OUTPUT_DIR/validation_images/"

# ---------------------------------------------------------------------------
# Optional: quick test training command (kept commented out).
# ---------------------------------------------------------------------------
# cat << 'EOF'
#
# Quick test command (small number of steps):
# accelerate launch train_rectified_noise.py \
#   --pretrained_model_name_or_path="stabilityai/stable-diffusion-3-medium-diffusers" \
#   --lora_model_path="./sd3-lora-finetuned" \
#   --train_data_dir="./dataset" \
#   --num_sit_layers=2 \
#   --resolution=256 \
#   --train_batch_size=1 \
#   --gradient_accumulation_steps=4 \
#   --max_train_steps=100 \
#   --output_dir="./test-rectified-noise" \
#   --mixed_precision="fp16" \
#   --save_sit_weights_only
# EOF

# Run in the background with a log file:
# nohup bash train_rectified_noise.sh > train_rectified_noise.log 2>&1 &