DeLVM / dist_train_vqgan.sh
jirong's picture
Upload folder using huggingface_hub
ee3e701 verified
#!/usr/bin/env bash
#
# Launches distributed VQGAN training (dist_train_vqgan.py) through torchrun.
#
# Environment read by this script:
#   METIS_WORKER_0_HOST  - address passed to torchrun as --master_addr
#   METIS_WORKER_0_PORT  - comma-separated port list; the first entry is used
#   ARNOLD_WORKER_NUM    - passed to torchrun as --nnodes
#   ARNOLD_ID            - passed to torchrun as --node_rank
#   ARNOLD_WORKER_GPU    - only printed for reference
#   ARNOLD_RDMA_DEVICE   - exported as NCCL_IB_HCA

# Route HTTP(S) traffic of everything started from here through the proxy.
export http_proxy=http://bj-rd-proxy.byted.org:3128
export https_proxy=http://bj-rd-proxy.byted.org:3128
# source /mnt/bn/robotics-data-hl/jirong/git/calvin_incontext/bin/activate

# Absolute directory containing this script. Every expansion is quoted so a
# path with spaces or glob characters is handled as a single word.
CUR_DIR=$(cd "$(dirname -- "$0")" && pwd)
cd "$CUR_DIR"

# Prefer the shared codebase checkout. A failure here is deliberately not
# fatal (the shell simply stays in the script directory), but say so loudly.
cd /mnt/bn/roboicl-jirong/codebase/DeLVM \
  || printf 'warning: codebase directory unavailable, staying in %s\n' "$PWD" >&2
#######################################
# Split a comma-separated port list into the global array `ports`.
# Globals:   ports (written)
# Arguments: $1 - port list, e.g. "2222,2223"
#######################################
split_port_list() {
  # Commas become spaces and `read -a` does the splitting. Unlike an unquoted
  # command substitution inside an array literal, this never glob-expands the
  # values and needs no external processes.
  IFS=$' \t\n' read -r -a ports <<<"${1//,/ }"
}

split_port_list "${METIS_WORKER_0_PORT:-}"
# Only the first entry is used; it is reported below as the master port.
port=${ports[0]:-}

# Summary of the job topology as seen from this worker.
printf '%s\n' "total workers: ${ARNOLD_WORKER_NUM:-}"
printf '%s\n' "cur worker id: ${ARNOLD_ID:-}"
printf '%s\n' "gpus per worker: ${ARNOLD_WORKER_GPU:-}"
printf '%s\n' "master ip: ${METIS_WORKER_0_HOST:-}"
printf '%s\n' "master port: ${port}"
# OpenMP / NCCL settings inherited by every process started after this point.
# Values are assigned first and exported together in one statement.
OMP_NUM_THREADS=8
NCCL_IB_DISABLE=0
NCCL_IB_GID_INDEX=3
# The NCCL_IB_HCA value is supplied by the platform via ARNOLD_RDMA_DEVICE.
NCCL_IB_HCA="${ARNOLD_RDMA_DEVICE}"
NCCL_SOCKET_IFNAME=eth0
NCCL_DEBUG=INFO
export OMP_NUM_THREADS \
  NCCL_IB_DISABLE NCCL_IB_GID_INDEX NCCL_IB_HCA \
  NCCL_SOCKET_IFNAME NCCL_DEBUG
# a=4
# node_rank=$((ARNOLD_ID-a))
# script_name=$0
# bs=$1
# nw=$2
# lr=$3
# folder=$4
# Launch dist_train_vqgan.py on this node through torchrun.
# The command is assembled as an array with every expansion quoted, so an
# empty or whitespace-containing value stays in its own argument slot instead
# of vanishing or splitting and silently shifting the flags that follow it.
# NOTE(review): --nproc_per_node is fixed at 8 although the script also prints
# ARNOLD_WORKER_GPU; confirm the two always agree on the target cluster.
train_cmd=(
  torchrun
  --nnodes "${ARNOLD_WORKER_NUM}"
  --node_rank "${ARNOLD_ID}"
  --nproc_per_node 8
  --master_addr "${METIS_WORKER_0_HOST}"
  --master_port "${port}"
  dist_train_vqgan.py
  --batch_size 16
  --epoch 5000000
  --log_folder f16_192_real_calvin_robot_datacomp_1e-5_disc_start_0_weight_0.2
)
"${train_cmd[@]}"
# torchrun \
# --nnodes 4 \
# --node_rank $ARNOLD_ID \
# --nproc_per_node 8 \
# --master_addr $METIS_WORKER_0_HOST \
# --master_port $port \
# dist_train_vqgan.py --batch_size 30 --epoch 500000 --log_folder f16_192_cofinetune