# Proxy configuration for outbound HTTP(S) traffic.
# NOTE(review): the lowercase and uppercase variables point at different
# proxies. The lowercase pair used to be exported as http://10.63.229.53:8891
# first and was then immediately overwritten with agent.baidu.com:8188; only
# the effective (final) values are kept here. Confirm which proxy is intended.
export http_proxy=agent.baidu.com:8188
export https_proxy=agent.baidu.com:8188
export HTTP_PROXY=http://10.63.229.53:8891
export HTTPS_PROXY=http://10.63.229.53:8891

# Loopback destinations are listed as proxy exceptions.
export NO_PROXY=localhost,127.0.0.1,::1
export no_proxy=localhost,127.0.0.1,::1

# GPU indices exported to every process started from this script.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Run identifier; it names the per-run directory under ckpts/ used further
# down for the log directory.
PROJECT_NAME=Wan2.2_5B-Multi_view-normal_rope_1280_720-3ref_dianshang

# train.py is launched through a relative path, so stop right here instead of
# carrying on from the wrong directory when the project root is unreachable.
project_dir=/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view
cd "$project_dir" || {
  echo "[ERROR] cannot cd to $project_dir" >&2
  exit 1
}

# Rendezvous endpoint on the loopback address; MASTER_PORT is also handed to
# `accelerate launch` as --main_process_port later in the script.
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=39900

# NCCL transport settings. Per the NCCL environment-variable documentation:
# IB_DISABLE=1 turns off the InfiniBand transport, SOCKET_IFNAME=lo selects the
# loopback interface for socket traffic, CROSS_NIC=0 disallows cross-NIC rings.
# NOTE(review): this looks like a single-node setup — confirm before reusing
# the script for multi-node jobs.
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=lo
export NCCL_CROSS_NIC=0

# Drop any WORLD_SIZE / NODE_RANK inherited from the calling environment.
unset WORLD_SIZE NODE_RANK

#######################################
# Make sure nothing is holding the given port; kill the holders if needed.
# Arguments: $1 - port number to check
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 if the port is free (or was freed), 1 if it is still occupied
#            after the kill, 2 if the check could not be performed
#######################################
free_port() {
  local port=$1
  local -a pids=()

  if [[ -z "$port" ]]; then
    echo "[WARNING] free_port: no port given, skipping port check" >&2
    return 2
  fi
  # Without lsof every probe would fail and the port would be reported as
  # free; say so explicitly instead.
  if ! command -v lsof > /dev/null 2>&1; then
    echo "[WARNING] lsof not found, cannot check port $port" >&2
    return 2
  fi

  # Collect the holder PIDs once, so the list that decides "occupied" is the
  # very list that gets killed (no second lsof call that may come back empty).
  mapfile -t pids < <(lsof -t -i :"$port" 2> /dev/null)
  if (( ${#pids[@]} == 0 )); then
    echo "[INFO] 端口 $port 未被占用"
    return 0
  fi

  echo "[INFO] 端口 $port 被占用,正在杀死相关进程..."
  # NOTE(review): SIGKILL is sent straight away; consider SIGTERM first if the
  # holders need a chance to clean up.
  kill -9 "${pids[@]}"
  sleep 2

  if lsof -i :"$port" > /dev/null 2>&1; then
    echo "[WARNING] 端口 $port 仍然被占用,可能需要手动处理"
    return 1
  fi
  echo "[INFO] 端口 $port 已释放"
}

# Best effort: a port that could not be freed is reported above, and the
# launcher itself will fail loudly if it cannot bind.
free_port "${MASTER_PORT:-}" || true

# Training configuration handed to train.py via --train_yaml; a copy is also
# stored in the run's log directory below.
CONF_YAML="/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/conf/multi-view.yaml"

# Per-run log directory, keyed by PROJECT_NAME.
LOG_DIR="/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/ckpts/${PROJECT_NAME}/log"

if [[ -d "$LOG_DIR" ]]; then
  echo "[$(date)] $LOG_DIR 已存在"
else
  echo "[$(date)] $LOG_DIR 不存在,正在创建..."
  mkdir -p -- "$LOG_DIR" || echo "[WARNING] could not create $LOG_DIR" >&2
fi

# Snapshot the config next to the logs. The trailing slash makes cp fail
# instead of writing a plain file named "log" if the directory is missing.
cp -rf -- "${CONF_YAML}" "${LOG_DIR}/" \
  || echo "[WARNING] could not copy ${CONF_YAML} to ${LOG_DIR}" >&2

#######################################
# Start train.py through `accelerate launch`.
# Globals:   MASTER_PORT (read) - passed as --main_process_port
#            CONF_YAML (read)   - passed as --train_yaml
# Returns:   exit status of `accelerate launch`
#######################################
run_training() {
  # Options consumed by the accelerate launcher itself.
  local -a launcher_opts=(
    --config_file="/root/paddlejob/workspace/qizipeng/baidu/personal-code/Multi-view/multi_view/conf/accelerate_config_5B.yaml"
    --main_process_port "$MASTER_PORT"
  )
  # Options forwarded to train.py.
  local -a train_opts=(
    --dataset_repeat 1
    --model_id_with_origin_paths "Wan2.2-TI2V-5B:diffusion_pytorch_model*.safetensors,Wan2.2-TI2V-5B:models_t5_umt5-xxl-enc-bf16.pth,Wan2.2-TI2V-5B:Wan2.2_VAE.pth"
    --learning_rate 1e-5
    --num_epochs 100
    --num_frames 81
    --remove_prefix_in_ckpt "pipe.dit."
    --trainable_models "dit"
    --train_yaml "${CONF_YAML}"
    --extra_inputs "cropped_images"
  )

  accelerate launch "${launcher_opts[@]}" train.py "${train_opts[@]}"
}

# errexit is off while training runs so that a failure is captured in
# EXIT_CODE rather than terminating the script on the spot.
set +e
run_training
EXIT_CODE=$?
# From here on the script runs with errexit enabled.
set -e

# Run the gpu_hog.py helper with all eight GPU indices exposed to it.
run_gpu_hog() {
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python /root/paddlejob/workspace/qizipeng/gpu_hog.py
}

#######################################
# Report the training outcome and run gpu_hog.py afterwards.
# Arguments: $1 - exit status of the training run (defaults to 0)
# Outputs:   status messages on stdout, diagnostics on stderr
# Returns:   on failure, exits the script with the training exit status;
#            on success, returns the status of gpu_hog.py
#######################################
handle_training_result() {
  local train_status=${1:-0}

  if [[ $train_status -ne 0 ]]; then
    echo "[parent] training failed with code ${train_status}"
    echo "[parent] running gpu_hog.py to investigate GPU status..."
    # With errexit active, a non-zero gpu_hog.py would abort the script here
    # and replace the training exit status; guard it so that the original
    # failure code is what the caller sees.
    run_gpu_hog || echo "[parent] gpu_hog.py exited with code $?" >&2
    exit "$train_status"
  fi

  echo "[$(date)] 训练正常结束 ✅"
  run_gpu_hog
}

handle_training_result "${EXIT_CODE:-0}"
