studyOverflow
/

egrpo

Model card Files Files and versions

egrpo / scripts /run_g2rpo_qwenimage.sh

studyOverflow's picture

Add files using upload-large-folder tool

a685ccc verified 3 months ago

history blame contribute delete

2.12 kB

	#!/bin/bash
	# E-GRPO (G2RPO) training script for QwenImage
	# Based on finetune_mergestep.sh configuration
	#
	# Optional environment overrides (default in parentheses):
	#   NNODES (1), NODE_RANK (0), MASTER_ADDR (localhost), MASTER_PORT (29501)
	# CUDA_VISIBLE_DEVICES, NPROC_PER_NODE and the NCCL_* variables below are
	# always set by this script, regardless of the caller's environment.

	# Strict mode: stop on a failing command, on expansion of an unset variable,
	# and when any stage of a pipeline fails.
	set -euo pipefail

	# GPU 6 is faulty - use only the 6 confirmed working GPUs: 0,1,2,3,4,5
	# Force set these values (override any existing env vars)
	export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
	NPROC_PER_NODE=6

	# Rendezvous settings: taken from the environment when set and non-empty,
	# otherwise fall back to single-node defaults.
	NNODES=${NNODES:-1}
	NODE_RANK=${NODE_RANK:-0}
	MASTER_ADDR=${MASTER_ADDR:-localhost}
	MASTER_PORT=${MASTER_PORT:-29501}

	# NCCL configuration to fix communication hangs
	export NCCL_P2P_DISABLE=1 # Disable P2P (peer-to-peer GPU communication)
	export NCCL_IB_DISABLE=1 # Disable InfiniBand
	export NCCL_SHM_DISABLE=0 # Keep shared memory enabled
	export NCCL_SOCKET_IFNAME=lo # Use localhost interface
	export NCCL_DEBUG=WARN # Show warnings

	# The loopback interface is forced above while NNODES can be overridden from
	# the environment; flag that combination instead of failing silently later.
	if [[ "$NNODES" != "1" ]]; then
	  echo "WARNING: NCCL_SOCKET_IFNAME=lo is forced but NNODES=$NNODES; cross-node communication may not work" >&2
	fi

	# Change to source_code directory (the parent of the directory that holds
	# this script) so relative paths resolve no matter where it is invoked from.
	cd -- "$(dirname -- "$0")/.." || {
	  echo "ERROR: cannot change to the parent directory of $0" >&2
	  exit 1
	}

	# Print the effective launch configuration.
	echo "=========================================="
	echo "E-GRPO Training for QwenImage"
	echo "=========================================="
	echo "Nodes: $NNODES"
	echo "GPUs per node: $NPROC_PER_NODE"
	echo "Master addr: $MASTER_ADDR"
	echo "Master port: $MASTER_PORT"
	echo "=========================================="

	# Run training
	#
	# Starts fastvideo/train_g2rpo_qwenimage_merge.py through the
	# torch.distributed.run launcher. The launcher options (--nnodes through
	# --master_port) come from the variables configured earlier in this script;
	# they are quoted so values taken from the environment are passed as single
	# words. Everything after the script path is forwarded to the training
	# script, which defines the meaning of those flags. Model, data, reward-model
	# and output paths are relative to the current working directory.
	# NOTE(review): --eta_step_list and --eta_step_merge_list both carry eight
	# values; presumably they must stay the same length - confirm in the trainer.
	python -m torch.distributed.run \
	  --nnodes="$NNODES" \
	  --nproc_per_node="$NPROC_PER_NODE" \
	  --node_rank="$NODE_RANK" \
	  --master_addr="$MASTER_ADDR" \
	  --master_port="$MASTER_PORT" \
	  fastvideo/train_g2rpo_qwenimage_merge.py \
	  --pretrained_model_name_or_path ./data/QwenImage \
	  --data_json_path ./data/qwenimage_rl_embeddings/videos2caption.json \
	  --output_dir ./output/g2rpo_qwenimage \
	  --hps_path ./data/hps/HPS_v2.1_compressed.pt \
	  --hps_clip_path ./data/hps/open_clip_pytorch_model.bin \
	  --h 1024 \
	  --w 1024 \
	  --sampling_steps 16 \
	  --eta 0.7 \
	  --shift 3.0 \
	  --num_generations 12 \
	  --learning_rate 2e-6 \
	  --max_train_steps 301 \
	  --checkpointing_steps 50 \
	  --eta_step_list 0 1 2 3 4 5 6 7 \
	  --eta_step_merge_list 1 1 1 2 2 2 3 3 \
	  --granular_list 1 \
	  --init_same_noise \
	  --clip_range 1e-4 \
	  --adv_clip_max 5.0 \
	  --use_hpsv2

	# Printed only when the launcher exits with status 0: the script runs with
	# errexit enabled, so a failed launch ends the script before this line.
	echo "Training completed!"