#!/usr/bin/env bash
#
# Launch dist_train_vqgan.py on this worker through torchrun as part of a
# multi-node job.
#
# Required environment variables:
#   ARNOLD_WORKER_NUM    value passed to torchrun --nnodes
#   ARNOLD_ID            value passed to torchrun --node_rank
#   METIS_WORKER_0_HOST  value passed to torchrun --master_addr
#   METIS_WORKER_0_PORT  comma-separated port list; the first entry is
#                        passed to torchrun --master_port
# Optional environment variables:
#   ARNOLD_WORKER_GPU    only printed in the startup log
#   ARNOLD_RDMA_DEVICE   exported as NCCL_IB_HCA

export http_proxy=http://bj-rd-proxy.byted.org:3128
export https_proxy=http://bj-rd-proxy.byted.org:3128

# source /mnt/bn/robotics-data-hl/jirong/git/calvin_incontext/bin/activate

# Fail early with a clear message instead of handing empty arguments to
# torchrun.
: "${ARNOLD_WORKER_NUM:?ARNOLD_WORKER_NUM must be set}"
: "${ARNOLD_ID:?ARNOLD_ID must be set}"
: "${METIS_WORKER_0_HOST:?METIS_WORKER_0_HOST must be set}"
: "${METIS_WORKER_0_PORT:?METIS_WORKER_0_PORT must be set}"

CUR_DIR=$(cd -- "$(dirname -- "$0")" && pwd) || {
  printf 'error: cannot resolve script directory from %s\n' "$0" >&2
  exit 1
}
cd -- "$CUR_DIR" || {
  printf 'error: cannot cd to %s\n' "$CUR_DIR" >&2
  exit 1
}

# The training entry point is referenced by relative path below, so the
# launch must not proceed from any other working directory.
cd /mnt/bn/roboicl-jirong/codebase/DeLVM || {
  printf 'error: cannot cd to %s\n' /mnt/bn/roboicl-jirong/codebase/DeLVM >&2
  exit 1
}

# Split the comma-separated port list; only the first port is used.
IFS=',' read -r -a ports <<< "$METIS_WORKER_0_PORT"
port=${ports[0]}

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

export OMP_NUM_THREADS=8
# NCCL settings; see the NCCL environment variable documentation for the
# exact meaning of each knob.
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO

# a=4
# node_rank=$((ARNOLD_ID-a))

# script_name=$0
# bs=$1
# nw=$2
# lr=$3
# folder=$4

# NOTE(review): --nproc_per_node is fixed at 8 while ARNOLD_WORKER_GPU is only
# logged above -- confirm the two always agree on the target cluster.
torchrun \
  --nnodes "$ARNOLD_WORKER_NUM" \
  --node_rank "$ARNOLD_ID" \
  --nproc_per_node 8 \
  --master_addr "$METIS_WORKER_0_HOST" \
  --master_port "$port" \
  dist_train_vqgan.py --batch_size 16 --epoch 5000000 --log_folder f16_192_real_calvin_robot_datacomp_1e-5_disc_start_0_weight_0.2

# torchrun \
#   --nnodes 4 \
#   --node_rank $ARNOLD_ID \
#   --nproc_per_node 8 \
#   --master_addr $METIS_WORKER_0_HOST \
#   --master_port $port \
#   dist_train_vqgan.py --batch_size 30 --epoch 500000 --log_folder f16_192_cofinetune