#!/bin/bash
|
|
#######################################
# Print the first TCP port in [start_port, end_port) that can be bound on
# all local interfaces. The upper bound is exclusive.
# Arguments:
#   $1 - first port to try (default 6666)
#   $2 - port to stop before (default 8888)
# Outputs:
#   stdout: the chosen port. When no port in the range can be bound,
#           start_port is printed as a best-effort fallback.
#   stderr: a warning when that fallback is used.
# Returns:
#   Exit status of python3: 0 when a port (or the fallback) was printed,
#   non-zero when python3 is missing or the arguments are not integers
#   (nothing is printed on stdout in that case).
#######################################
find_available_port() {
  local start_port="${1:-6666}"
  local end_port="${2:-8888}"

  # The quoted heredoc delimiter stops the shell from expanding anything
  # inside the Python program; the bounds are passed as argv instead.
  python3 - "$start_port" "$end_port" <<'PY'
import socket
import sys

start_port = int(sys.argv[1])
end_port = int(sys.argv[2])

for port in range(start_port, end_port):
    # "with" closes the probe socket on every path, including a failed bind.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        try:
            sock.bind(("", port))
        except (OSError, OverflowError):
            # OSError: the port cannot be bound right now.
            # OverflowError: the candidate is outside 0-65535.
            continue
    print(port)
    raise SystemExit(0)

# Nothing in the range was bindable: keep the historical fallback value on
# stdout, but make the situation visible on stderr.
print(
    f"find_available_port: no free port in [{start_port}, {end_port}); "
    f"falling back to {start_port}",
    file=sys.stderr,
)
print(start_port)
PY
}
|
|
|
|
#######################################
# Give a set of environment variables their default values (only when they
# are unset or empty), export them, and report two of them on stdout.
# Globals (read and written, all exported):
#   EXP_HW_20250819                    default "False"
#   POSITION_EMBEDDING_3D_VERSION      default "v2"
#   CUDA_LAUNCH_BLOCKING               default "0"
#   NCCL_DEBUG                         default "VERSION"
#   TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC   default "900"
# Arguments: none
# Outputs:   two "NAME: value" lines on stdout
#######################################
lance_setup_common_env() {
  # ":=" assigns the default only when the variable is unset or empty, so
  # values supplied by the caller's environment are preserved.
  : "${EXP_HW_20250819:=False}"
  : "${POSITION_EMBEDDING_3D_VERSION:=v2}"
  : "${CUDA_LAUNCH_BLOCKING:=0}"
  : "${NCCL_DEBUG:=VERSION}"
  : "${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:=900}"

  export EXP_HW_20250819 \
    POSITION_EMBEDDING_3D_VERSION \
    CUDA_LAUNCH_BLOCKING \
    NCCL_DEBUG \
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC

  echo "EXP_HW_20250819: $EXP_HW_20250819"
  echo "(shell) POSITION_EMBEDDING_3D_VERSION: $POSITION_EMBEDDING_3D_VERSION"
}
|
|
|
|
#######################################
# Derive and export the distributed-launch variables.
#
# When ARNOLD_WORKER_NUM is set, defaults come from the ARNOLD_* variables;
# otherwise single-machine local defaults are used. Values already present
# in NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP and MAIN_PROCESS_PORT
# always take precedence.
#
# Port selection when MAIN_PROCESS_PORT is not supplied:
#   - ARNOLD with NUM_MACHINES != 1: the platform port is used as-is.
#   - otherwise: find_available_port scans [default, default + 500).
#     If the helper fails or prints nothing, the default port is used and a
#     warning is written to stderr.
#
# Globals:
#   read:    ARNOLD_WORKER_NUM, ARNOLD_ID, ARNOLD_WORKER_0_HOST,
#            ARNOLD_WORKER_0_PORT
#   written: NUM_GPUS, NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP,
#            MAIN_PROCESS_PORT, TOTAL_RANK (all exported)
# Arguments:
#   $1 - GPUs per machine, positive integer (default 1)
# Outputs:
#   stdout: the detected environment and the resulting values
#   stderr: validation errors and the port-fallback warning
# Returns:
#   0 on success; 1 when num_gpus, NUM_MACHINES or the default port is not
#   a positive integer (nothing is exported in that case).
#######################################
lance_setup_distributed_env() {
  local num_gpus="${1:-1}"
  local default_main_process_port
  local has_explicit_main_process_port=0
  local scan_for_free_port=0
  local candidate_port

  # Validate before the value reaches $(( )): arithmetic expansion would
  # otherwise evaluate arbitrary text (or choke on leading-zero octal).
  case "$num_gpus" in
    '' | *[!0-9]* | 0*)
      echo "lance_setup_distributed_env: num_gpus must be a positive integer, got '$num_gpus'" >&2
      return 1
      ;;
  esac

  NUM_GPUS="$num_gpus"

  # ${VAR:-} keeps these probes safe when the caller runs under `set -u`.
  if [ -n "${MAIN_PROCESS_PORT:-}" ]; then
    has_explicit_main_process_port=1
  fi

  if [ -n "${ARNOLD_WORKER_NUM:-}" ]; then
    echo "检测到 ARNOLD 平台环境"
    NUM_MACHINES="${NUM_MACHINES:-$ARNOLD_WORKER_NUM}"
    MACHINE_RANK="${MACHINE_RANK:-${ARNOLD_ID:-0}}"
    MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-${ARNOLD_WORKER_0_HOST:-127.0.0.1}}"
    default_main_process_port="${ARNOLD_WORKER_0_PORT:-6666}"
    # NOTE(review): if the platform ever publishes a comma-separated port
    # list here, keep only the first entry; a single value is unchanged.
    # Assumes the first entry is the rendezvous port - confirm.
    default_main_process_port="${default_main_process_port%%,*}"

    # Only a single-machine job may pick its own port; multi-machine jobs
    # must all agree on the platform-provided one.
    if [ "$NUM_MACHINES" = "1" ]; then
      scan_for_free_port=1
    fi
  else
    echo "使用本地环境配置"
    NUM_MACHINES="${NUM_MACHINES:-1}"
    MACHINE_RANK="${MACHINE_RANK:-0}"
    MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-127.0.0.1}"
    default_main_process_port=6666
    scan_for_free_port=1
  fi

  case "$NUM_MACHINES" in
    '' | *[!0-9]* | 0*)
      echo "lance_setup_distributed_env: NUM_MACHINES must be a positive integer, got '$NUM_MACHINES'" >&2
      return 1
      ;;
  esac

  if [ "$has_explicit_main_process_port" -eq 1 ]; then
    # The caller chose the port; leave it untouched.
    :
  elif [ "$scan_for_free_port" -eq 1 ]; then
    case "$default_main_process_port" in
      '' | *[!0-9]* | 0*)
        echo "lance_setup_distributed_env: default port must be a positive integer, got '$default_main_process_port'" >&2
        return 1
        ;;
    esac

    # Never leave MAIN_PROCESS_PORT empty: fall back to the default port
    # when the helper fails or produces no output.
    if ! candidate_port="$(find_available_port "$default_main_process_port" "$((default_main_process_port + 500))")" \
      || [ -z "$candidate_port" ]; then
      echo "lance_setup_distributed_env: port scan failed, using default port $default_main_process_port" >&2
      candidate_port="$default_main_process_port"
    fi
    MAIN_PROCESS_PORT="$candidate_port"
  else
    MAIN_PROCESS_PORT="$default_main_process_port"
    echo "多机任务使用平台 rendezvous 端口: $MAIN_PROCESS_PORT"
  fi

  TOTAL_RANK=$((NUM_MACHINES * NUM_GPUS))

  export NUM_GPUS NUM_MACHINES MACHINE_RANK MAIN_PROCESS_IP MAIN_PROCESS_PORT TOTAL_RANK

  echo "NUM_MACHINES: $NUM_MACHINES"
  echo "NUM_GPUS: $NUM_GPUS"
  echo "TOTAL_RANK: $TOTAL_RANK"
  echo "MACHINE_RANK: $MACHINE_RANK"
  echo "MAIN_PROCESS_IP: $MAIN_PROCESS_IP"
  echo "MAIN_PROCESS_PORT: $MAIN_PROCESS_PORT"
}
|
|
|
|
#######################################
# Compute and export the shard/replica split:
#   NUM_SHARD     = $1
#   NUM_REPLICATE = TOTAL_RANK / NUM_SHARD   (integer division)
# Globals:
#   read:    TOTAL_RANK (treated as 0 when unset or empty)
#   written: NUM_SHARD, NUM_REPLICATE (both exported)
# Arguments:
#   $1 - number of shards, positive integer (default 1)
# Outputs:
#   stdout: "NUM_REPLICATE: ..." and "NUM_SHARD: ..." lines
#   stderr: validation errors; a warning when TOTAL_RANK is 0/unset or is
#           not a multiple of num_shard
# Returns:
#   0 on success; 1 when num_shard is not a positive integer or TOTAL_RANK
#   is not a non-negative integer (nothing is modified in that case).
#######################################
lance_setup_shard_env() {
  local num_shard="${1:-1}"
  local total_rank="${TOTAL_RANK:-0}"

  # Reject zero, leading zeros and non-digits up front: they would
  # otherwise surface as a raw "division by 0" or arithmetic syntax error
  # from $(( )).
  case "$num_shard" in
    *[!0-9]* | 0*)
      echo "lance_setup_shard_env: num_shard must be a positive integer, got '$num_shard'" >&2
      return 1
      ;;
  esac

  case "$total_rank" in
    *[!0-9]* | 0?*)
      echo "lance_setup_shard_env: TOTAL_RANK must be a non-negative integer, got '$total_rank'" >&2
      return 1
      ;;
  esac

  NUM_SHARD="$num_shard"
  NUM_REPLICATE=$((total_rank / num_shard))

  # The result is still produced in these cases, but it is almost certainly
  # not what the caller wants, so say so.
  if [ "$total_rank" -eq 0 ]; then
    echo "lance_setup_shard_env: warning: TOTAL_RANK is unset or 0, NUM_REPLICATE is 0" >&2
  elif [ "$((total_rank % num_shard))" -ne 0 ]; then
    echo "lance_setup_shard_env: warning: TOTAL_RANK ($total_rank) is not a multiple of num_shard ($num_shard); NUM_REPLICATE truncated to $NUM_REPLICATE" >&2
  fi

  export NUM_SHARD NUM_REPLICATE

  echo "NUM_REPLICATE: $NUM_REPLICATE"
  echo "NUM_SHARD: $NUM_SHARD"
}
|
|