multishot / multi_view /train_single.sh
PencilHu's picture
Upload folder using huggingface_hub
85752bc verified
export http_proxy=http://10.63.229.53:8891
export https_proxy=http://10.63.229.53:8891
export HTTP_PROXY=http://10.63.229.53:8891
export HTTPS_PROXY=http://10.63.229.53:8891
export NO_PROXY=localhost,127.0.0.1,::1
export no_proxy=localhost,127.0.0.1,::1
export http_proxy=agent.baidu.com:8188
export https_proxy=agent.baidu.com:8188
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
PROJECT_NAME=Wan2.2_5B-Multi_view-normal_rope_480_832-3ref ###piexels
# export WANDB_MODE=offline # 关键:离线
# export WANDB_DIR=./save_checkpoint/${PROJECT_NAME}/wandb_log # 可选:本地保存目录
# cd /root/paddlejob/workspace/qizipeng
# bash /root/paddlejob/workspace/qizipeng/tar.sh
cd /root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view
### 下面的代码 让在多机,仅使用一台机器训练
# 明确告诉后端:主节点就是本机
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=39900 # 换成一个空闲端口也行
# 防止去走 IB/RDMA 或跨网卡
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=lo # 或者写本机实际网卡名,lo 最保险只走回环
export NCCL_CROSS_NIC=0
#(可选)把 elastic/多机相关的残留变量清掉
unset WORLD_SIZE NODE_RANK
# 检查端口占用并杀死进程
if lsof -i :$MASTER_PORT > /dev/null 2>&1; then
echo "[INFO] 端口 $MASTER_PORT 被占用,正在杀死相关进程..."
kill -9 $(lsof -t -i:$MASTER_PORT)
sleep 2
# 再次检查是否成功杀死
if lsof -i :$MASTER_PORT > /dev/null 2>&1; then
echo "[WARNING] 端口 $MASTER_PORT 仍然被占用,可能需要手动处理"
else
echo "[INFO] 端口 $MASTER_PORT 已释放"
fi
else
echo "[INFO] 端口 $MASTER_PORT 未被占用"
fi
#-------------- 写 YAML(优先 yq,回退 sed) --------------
CONF_YAML="/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/conf/multi-view.yaml"
LOG_DIR="/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/ckpts/${PROJECT_NAME}/log"
# 检查目录是否存在,如果不存在就创建
if [ ! -d "$LOG_DIR" ]; then
echo "[$(date)] $LOG_DIR 不存在,正在创建..."
mkdir -p "$LOG_DIR"
else
echo "[$(date)] $LOG_DIR 已存在"
fi
cp -rf "${CONF_YAML}" /root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/ckpts/${PROJECT_NAME}/log
set +e
python train.py \
--dataset_repeat 1 \
--model_id_with_origin_paths "Wan2.2-TI2V-5B:diffusion_pytorch_model*.safetensors,Wan2.2-TI2V-5B:models_t5_umt5-xxl-enc-bf16.pth,Wan2.2-TI2V-5B:Wan2.2_VAE.pth" \
--learning_rate 1e-5 \
--num_epochs 100 \
--num_frames 81 \
--remove_prefix_in_ckpt "pipe.dit." \
--trainable_models "dit" \
--train_yaml "${CONF_YAML}" \
--extra_inputs "cropped_images" \
EXIT_CODE=$?
set -e
# --use_gradient_checkpointing_offload \
# --output_path "/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/ckpts/${PROJECT_NAME}" \
if [[ $EXIT_CODE -ne 0 ]]; then
echo "[parent] training failed with code ${EXIT_CODE}"
echo "[parent] running gpu_hog.py to investigate GPU status..."
# CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python /root/paddlejob/workspace/qizipeng/gpu_hog.py
exit $EXIT_CODE
else
echo "[$(date)] 训练正常结束 ✅"
fi
# CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python /root/paddlejob/workspace/qizipeng/gpu_hog.py