#!/usr/bin/env bash
#
# Launches distributed VQGAN training (dist_train_vqgan.py) through torchrun.
#
# Required environment (presumably injected by the cluster scheduler):
#   METIS_WORKER_0_HOST  - passed to torchrun as --master_addr
#   METIS_WORKER_0_PORT  - comma-separated port list; the first entry is
#                          passed to torchrun as --master_port
#   ARNOLD_WORKER_NUM    - passed to torchrun as --nnodes
#   ARNOLD_ID            - passed to torchrun as --node_rank
# Optional environment (only logged / forwarded as-is):
#   ARNOLD_WORKER_GPU, ARNOLD_RDMA_DEVICE

export http_proxy=http://bj-rd-proxy.byted.org:3128
export https_proxy=http://bj-rd-proxy.byted.org:3128

# source /mnt/bn/robotics-data-hl/jirong/git/calvin_incontext/bin/activate

# Directory containing this script. The absolute cd below decides the final
# working directory, so a failure here is reported but not fatal.
CUR_DIR=$(cd "$(dirname "$0")" && pwd)
cd "$CUR_DIR" || echo "warning: cannot cd to script directory '$CUR_DIR'" >&2

# dist_train_vqgan.py is resolved relative to this directory; abort rather
# than launching torchrun from the wrong place.
cd /mnt/bn/roboicl-jirong/codebase/DeLVM || {
  echo "error: cannot cd to /mnt/bn/roboicl-jirong/codebase/DeLVM" >&2
  exit 1
}

# Fail early with a clear message instead of handing torchrun a shifted
# argument list when one of these is unset or empty.
: "${METIS_WORKER_0_PORT:?METIS_WORKER_0_PORT must be set}"
: "${METIS_WORKER_0_HOST:?METIS_WORKER_0_HOST must be set}"
: "${ARNOLD_WORKER_NUM:?ARNOLD_WORKER_NUM must be set}"
: "${ARNOLD_ID:?ARNOLD_ID must be set}"

# Split the port list on commas (and blanks) without globbing; the first
# entry is used as the rendezvous port.
IFS=', ' read -r -a ports <<<"$METIS_WORKER_0_PORT"
port=${ports[0]}
: "${port:?could not parse a port from METIS_WORKER_0_PORT}"

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

export OMP_NUM_THREADS=8
# NCCL transport settings; see the NCCL environment-variable reference for
# the meaning of each knob.
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_IB_HCA="${ARNOLD_RDMA_DEVICE}"
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO

# Disabled: node-rank offset and positional-argument handling.
# a=4
# node_rank=$((ARNOLD_ID-a))
# script_name=$0
# bs=$1
# nw=$2
# lr=$3
# folder=$4

# NOTE(review): --nproc_per_node is hard-coded to 8 although
# ARNOLD_WORKER_GPU is logged above -- confirm they always agree.
torchrun \
  --nnodes "$ARNOLD_WORKER_NUM" \
  --node_rank "$ARNOLD_ID" \
  --nproc_per_node 8 \
  --master_addr "$METIS_WORKER_0_HOST" \
  --master_port "$port" \
  dist_train_vqgan.py --batch_size 16 --epoch 5000000 --log_folder f16_192_real_calvin_robot_datacomp_1e-5_disc_start_0_weight_0.2

# Disabled: alternative launch configuration.
# torchrun \
# --nnodes 4 \
# --node_rank $ARNOLD_ID \
# --nproc_per_node 8 \
# --master_addr $METIS_WORKER_0_HOST \
# --master_port $port \
# dist_train_vqgan.py --batch_size 30 --epoch 500000 --log_folder f16_192_cofinetune