#!/bin/bash
#
# Launch E-GRPO training for QwenImage through torch.distributed.run.
#
# Usage:
#   [NNODES=1] [NODE_RANK=0] [MASTER_ADDR=localhost] [MASTER_PORT=29501] \
#     bash <path-to-this-script>
#
# Environment overrides (each falls back to the default shown when unset or
# empty):
#   NNODES       number of nodes passed to the launcher      (default: 1)
#   NODE_RANK    rank of this node passed to the launcher    (default: 0)
#   MASTER_ADDR  rendezvous address passed to the launcher   (default: localhost)
#   MASTER_PORT  rendezvous port passed to the launcher      (default: 29501)
#
# The script changes into the parent of its own directory before launching,
# so every relative path below (./data, ./output, fastvideo/...) is resolved
# from there.

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# ---------------------------------------------------------------------------
# Device / rendezvous configuration
# ---------------------------------------------------------------------------
# Six devices are exposed and six worker processes are started per node.
# CUDA_VISIBLE_DEVICES is set unconditionally, replacing any inherited value.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
NPROC_PER_NODE=6
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29501}

# ---------------------------------------------------------------------------
# NCCL configuration
# ---------------------------------------------------------------------------
export NCCL_P2P_DISABLE=1     # turn off the GPU peer-to-peer transport
export NCCL_IB_DISABLE=1      # turn off the InfiniBand/RoCE transport
export NCCL_SHM_DISABLE=0     # keep the shared-memory transport enabled
# NOTE(review): pinning NCCL sockets to the loopback interface presumably only
# works when all ranks are on one host; confirm before running with NNODES > 1.
export NCCL_SOCKET_IFNAME=lo
export NCCL_DEBUG=WARN

# ---------------------------------------------------------------------------
# Working directory
# ---------------------------------------------------------------------------
cd -- "$(dirname -- "$0")/.." || {
  echo "error: cannot change to the parent of the script directory" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Banner
# ---------------------------------------------------------------------------
echo "=========================================="
echo "E-GRPO Training for QwenImage"
echo "=========================================="
echo "Nodes: $NNODES"
echo "GPUs per node: $NPROC_PER_NODE"
echo "Master addr: $MASTER_ADDR"
echo "Master port: $MASTER_PORT"
echo "=========================================="

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
# Arguments are kept in arrays so each value reaches the launcher as exactly
# one word, regardless of whitespace or glob characters in the overrides.
launcher_args=(
  "--nnodes=${NNODES}"
  "--nproc_per_node=${NPROC_PER_NODE}"
  "--node_rank=${NODE_RANK}"
  "--master_addr=${MASTER_ADDR}"
  "--master_port=${MASTER_PORT}"
)

training_args=(
  --pretrained_model_name_or_path ./data/QwenImage
  --data_json_path ./data/qwenimage_rl_embeddings/videos2caption.json
  --output_dir ./output/g2rpo_qwenimage
  --hps_path ./data/hps/HPS_v2.1_compressed.pt
  --hps_clip_path ./data/hps/open_clip_pytorch_model.bin
  --h 1024
  --w 1024
  --sampling_steps 16
  --eta 0.7
  --shift 3.0
  --num_generations 12
  --learning_rate 2e-6
  --max_train_steps 301
  --checkpointing_steps 50
  --eta_step_list 0 1 2 3 4 5 6 7
  --eta_step_merge_list 1 1 1 2 2 2 3 3
  --granular_list 1
  --init_same_noise
  --clip_range 1e-4
  --adv_clip_max 5.0
  --use_hpsv2
)

python -m torch.distributed.run \
  "${launcher_args[@]}" \
  fastvideo/train_g2rpo_qwenimage_merge.py \
  "${training_args[@]}"

# Reached only when the launcher exits with status 0 (set -e stops the script
# otherwise).
echo "Training completed!"