#!/bin/bash
#
# Launch E-GRPO training for Stable Diffusion via torch.distributed.run.
#
# Usage:
#   bash <this-script> [extra training args...]
#
# Any extra command-line arguments are appended to the training command.
#
# Environment overrides (default in parentheses):
#   CUDA_VISIBLE_DEVICES  comma-separated GPU ids to use (0,1,2,3,4,5)
#   NPROC_PER_NODE        worker processes per node (number of ids listed in
#                         CUDA_VISIBLE_DEVICES)
#   NNODES                number of nodes (1)
#   NODE_RANK             rank of this node (0)
#   MASTER_ADDR           rendezvous address (localhost)
#   MASTER_PORT           rendezvous port (29500)

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# ---------------------------------------------------------------------------
# Process topology
# ---------------------------------------------------------------------------
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5}"

# Derive the default worker count from the device list so the two settings
# cannot drift apart when only one of them is edited or overridden.
IFS=',' read -r -a gpu_ids <<< "${CUDA_VISIBLE_DEVICES}"
NPROC_PER_NODE="${NPROC_PER_NODE:-${#gpu_ids[@]}}"
NNODES="${NNODES:-1}"
NODE_RANK="${NODE_RANK:-0}"
MASTER_ADDR="${MASTER_ADDR:-localhost}"
MASTER_PORT="${MASTER_PORT:-29500}"

# ---------------------------------------------------------------------------
# NCCL transport settings
# ---------------------------------------------------------------------------
export NCCL_P2P_DISABLE=1      # disable direct GPU peer-to-peer transport
export NCCL_IB_DISABLE=1       # disable the InfiniBand transport
export NCCL_SHM_DISABLE=0      # keep the shared-memory transport enabled
export NCCL_SOCKET_IFNAME=lo   # socket traffic over loopback (single node)
export NCCL_DEBUG=WARN

# The relative paths below are resolved from the parent of the directory that
# contains this script, so the launcher works from any working directory.
script_dir="$(dirname -- "${BASH_SOURCE[0]}")"
cd -- "${script_dir}/.." || {
  echo "error: cannot change to ${script_dir}/.." >&2
  exit 1
}

echo "=========================================="
echo "E-GRPO Training for Stable Diffusion"
echo "=========================================="
echo "Nodes: $NNODES"
echo "GPUs per node: $NPROC_PER_NODE"
echo "Master addr: $MASTER_ADDR"
echo "Master port: $MASTER_PORT"
echo "=========================================="

# Arguments consumed by the torch.distributed.run launcher itself.
launcher_args=(
  --nnodes="${NNODES}"
  --nproc_per_node="${NPROC_PER_NODE}"
  --node_rank="${NODE_RANK}"
  --master_addr="${MASTER_ADDR}"
  --master_port="${MASTER_PORT}"
)

# Arguments forwarded to the training script.
train_args=(
  --config fastvideo/config_sd/base.py
  --eta_step_list 0,1,2,3,4,5,6,7
  --eta_step_merge_list 1,1,1,2,2,2,3,3
  --granular_list 1
  --num_generations 4
  --eta 1.0
  --init_same_noise
)

python -m torch.distributed.run \
  "${launcher_args[@]}" \
  fastvideo/train_g2rpo_sd_merge.py \
  "${train_args[@]}" \
  "$@"

# Only reached when the launcher exits with status 0 (set -e aborts otherwise).
echo "Training completed!"