#!/bin/bash
# E-GRPO (G2RPO) training script for QwenImage
# Based on finetune_mergestep.sh configuration
#
# Usage: run from anywhere; the script cd's to its parent (source_code) dir.
# Overridable env vars: NNODES, NODE_RANK, MASTER_ADDR, MASTER_PORT.

# Strict mode: exit on error (-e), error on unset vars (-u),
# and fail a pipeline if any stage fails (pipefail).
set -euo pipefail

# GPU 6 is faulty - use only the 6 confirmed working GPUs: 0,1,2,3,4,5
# Force set these values (override any existing env vars)
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
NPROC_PER_NODE=6
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29501}

# NCCL configuration to fix communication hangs
export NCCL_P2P_DISABLE=1     # Disable P2P (peer-to-peer GPU communication)
export NCCL_IB_DISABLE=1      # Disable InfiniBand
export NCCL_SHM_DISABLE=0     # Keep shared memory enabled
export NCCL_SOCKET_IFNAME=lo  # Use localhost interface
export NCCL_DEBUG=WARN        # Show warnings

# Change to source_code directory; fail loudly if the path is wrong.
cd "$(dirname "$0")/.." || { echo "cannot cd to script parent dir" >&2; exit 1; }

echo "=========================================="
echo "E-GRPO Training for QwenImage"
echo "=========================================="
echo "Nodes: $NNODES"
echo "GPUs per node: $NPROC_PER_NODE"
echo "Master addr: $MASTER_ADDR"
echo "Master port: $MASTER_PORT"
echo "=========================================="

# Run training (quoted expansions — SC2086).
# eta/shift/clip settings mirror finetune_mergestep.sh; adjust with care.
python -m torch.distributed.run \
  --nnodes="$NNODES" \
  --nproc_per_node="$NPROC_PER_NODE" \
  --node_rank="$NODE_RANK" \
  --master_addr="$MASTER_ADDR" \
  --master_port="$MASTER_PORT" \
  fastvideo/train_g2rpo_qwenimage_merge.py \
  --pretrained_model_name_or_path ./data/QwenImage \
  --data_json_path ./data/qwenimage_rl_embeddings/videos2caption.json \
  --output_dir ./output/g2rpo_qwenimage \
  --hps_path ./data/hps/HPS_v2.1_compressed.pt \
  --hps_clip_path ./data/hps/open_clip_pytorch_model.bin \
  --h 1024 \
  --w 1024 \
  --sampling_steps 16 \
  --eta 0.7 \
  --shift 3.0 \
  --num_generations 12 \
  --learning_rate 2e-6 \
  --max_train_steps 301 \
  --checkpointing_steps 50 \
  --eta_step_list 0 1 2 3 4 5 6 7 \
  --eta_step_merge_list 1 1 1 2 2 2 3 3 \
  --granular_list 1 \
  --init_same_noise \
  --clip_range 1e-4 \
  --adv_clip_max 5.0 \
  --use_hpsv2

echo "Training completed!"