File size: 3,253 Bytes
8b306b3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | #!/bin/bash
#######################################
# Print the first TCP port in [start_port, end_port) that can be bound on
# all interfaces right now.
#
# The probe socket is closed before the port number is printed, so the
# result is only a hint: another process may take the port before the
# caller binds it.
#
# Arguments:
#   $1 - first port to try (default 6666)
#   $2 - exclusive upper bound of the scan (default 8888)
# Outputs:
#   stdout: the chosen port. If no port in the range can be bound (or the
#           range is empty), start_port is printed as a last-resort guess.
#   stderr: a warning when that fallback is used.
# Returns:
#   0 when a port was printed; python3's non-zero status if the arguments
#   are not integers or python3 cannot be run.
#######################################
find_available_port() {
  local start_port="${1:-6666}"
  local end_port="${2:-8888}"

  # The heredoc body must stay at column 0: Python rejects indented
  # top-level statements, and <<'PY' passes the text through verbatim.
  python3 - "$start_port" "$end_port" <<'PY'
import socket
import sys

start_port = int(sys.argv[1])
end_port = int(sys.argv[2])

for port in range(start_port, end_port):
    # The context manager closes the probe socket on every path,
    # including when bind() fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        try:
            sock.bind(("", port))
        except (OSError, OverflowError):
            # OSError: port in use / not permitted.
            # OverflowError: CPython rejects ports outside 0-65535.
            continue
    print(port)
    sys.exit(0)

# Nothing bindable: keep the historical stdout contract (print start_port)
# but make the situation visible on stderr.
print(
    "find_available_port: no free port in [%d, %d); falling back to %d"
    % (start_port, end_port, start_port),
    file=sys.stderr,
)
print(start_port)
PY
}
#######################################
# Fill in defaults for the environment variables shared by the launch
# scripts and export them. A variable that is already set to a non-empty
# value is left as is.
#
# Globals (written and exported):
#   EXP_HW_20250819                    default "False"
#   POSITION_EMBEDDING_3D_VERSION      default "v2"
#   CUDA_LAUNCH_BLOCKING               default "0"
#   NCCL_DEBUG                         default "VERSION"
#   TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC   default "900"
# Outputs:
#   Prints the effective EXP_HW_20250819 and POSITION_EMBEDDING_3D_VERSION
#   values to stdout.
#######################################
lance_setup_common_env() {
  # ":=" assigns the default only when the variable is unset or empty.
  : "${EXP_HW_20250819:=False}"
  : "${POSITION_EMBEDDING_3D_VERSION:=v2}"

  # CUDA launches stay asynchronous by default for benchmark/inference
  # throughput; set CUDA_LAUNCH_BLOCKING=1 yourself only while debugging
  # kernel failures.
  : "${CUDA_LAUNCH_BLOCKING:=0}"
  : "${NCCL_DEBUG:=VERSION}"
  : "${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:=900}"

  printf 'EXP_HW_20250819: %s\n' "$EXP_HW_20250819"
  printf '(shell) POSITION_EMBEDDING_3D_VERSION: %s\n' \
    "$POSITION_EMBEDDING_3D_VERSION"

  export EXP_HW_20250819 POSITION_EMBEDDING_3D_VERSION \
    CUDA_LAUNCH_BLOCKING NCCL_DEBUG TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC
}
#######################################
# Work out the distributed-launch topology and export it.
#
# Two modes, selected by whether ARNOLD_WORKER_NUM is non-empty:
#   * ARNOLD platform: defaults come from ARNOLD_WORKER_NUM, ARNOLD_ID,
#     ARNOLD_WORKER_0_HOST and ARNOLD_WORKER_0_PORT.
#   * Local: one machine, rank 0, 127.0.0.1, base port 6666.
# In both modes any of NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP and
# MAIN_PROCESS_PORT that is already non-empty is kept.
#
# Port selection when MAIN_PROCESS_PORT is not preset:
#   * multi-machine ARNOLD job: use the platform port unchanged (no probe);
#   * otherwise: ask find_available_port for a port in
#     [default, default + 500). If that helper prints nothing or fails,
#     fall back to the default port and warn on stderr instead of
#     exporting an empty MAIN_PROCESS_PORT.
#
# Globals:
#   Read:    ARNOLD_WORKER_NUM, ARNOLD_ID, ARNOLD_WORKER_0_HOST,
#            ARNOLD_WORKER_0_PORT
#   Written and exported: NUM_GPUS, NUM_MACHINES, MACHINE_RANK,
#            MAIN_PROCESS_IP, MAIN_PROCESS_PORT, TOTAL_RANK
# Arguments:
#   $1 - number of GPUs per machine (default 1)
# Outputs:
#   Mode banner and the resulting settings on stdout.
#######################################
lance_setup_distributed_env() {
  local num_gpus="${1:-1}"
  local default_main_process_port
  local has_explicit_main_process_port=0
  local needs_port_probe=0
  local probed_port=""

  NUM_GPUS="$num_gpus"

  # ${VAR:-} keeps these reads safe when the caller runs under `set -u`.
  if [[ -n "${MAIN_PROCESS_PORT:-}" ]]; then
    has_explicit_main_process_port=1
  fi

  if [[ -n "${ARNOLD_WORKER_NUM:-}" ]]; then
    echo "检测到 ARNOLD 平台环境"
    NUM_MACHINES="${NUM_MACHINES:-$ARNOLD_WORKER_NUM}"
    MACHINE_RANK="${MACHINE_RANK:-${ARNOLD_ID:-0}}"
    MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-${ARNOLD_WORKER_0_HOST:-127.0.0.1}}"
    default_main_process_port="${ARNOLD_WORKER_0_PORT:-6666}"
    if (( has_explicit_main_process_port == 0 )); then
      if [[ "$NUM_MACHINES" == "1" ]]; then
        needs_port_probe=1
      else
        # Multi-machine jobs take the platform-provided port as is.
        MAIN_PROCESS_PORT="$default_main_process_port"
        echo "多机任务使用平台 rendezvous 端口: $MAIN_PROCESS_PORT"
      fi
    fi
  else
    echo "使用本地环境配置"
    NUM_MACHINES="${NUM_MACHINES:-1}"
    MACHINE_RANK="${MACHINE_RANK:-0}"
    MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-127.0.0.1}"
    default_main_process_port=6666
    if (( has_explicit_main_process_port == 0 )); then
      needs_port_probe=1
    fi
  fi

  if (( needs_port_probe )); then
    # A failing helper (for example python3 not installed) must not abort
    # the caller or leave the port empty.
    probed_port="$(find_available_port "$default_main_process_port" \
      "$((default_main_process_port + 500))")" || probed_port=""
    if [[ -z "$probed_port" ]]; then
      echo "warning: find_available_port returned no port;" \
        "using default $default_main_process_port" >&2
      probed_port="$default_main_process_port"
    fi
    MAIN_PROCESS_PORT="$probed_port"
  fi

  TOTAL_RANK=$((NUM_MACHINES * NUM_GPUS))
  export NUM_GPUS NUM_MACHINES MACHINE_RANK MAIN_PROCESS_IP \
    MAIN_PROCESS_PORT TOTAL_RANK

  echo "NUM_MACHINES: $NUM_MACHINES"
  echo "NUM_GPUS: $NUM_GPUS"
  echo "TOTAL_RANK: $TOTAL_RANK"
  echo "MACHINE_RANK: $MACHINE_RANK"
  echo "MAIN_PROCESS_IP: $MAIN_PROCESS_IP"
  echo "MAIN_PROCESS_PORT: $MAIN_PROCESS_PORT"
}
#######################################
# Split TOTAL_RANK into shard and replicate counts and export both.
#
# NUM_REPLICATE is the integer quotient TOTAL_RANK / NUM_SHARD (any
# remainder is discarded by shell arithmetic).
#
# Globals:
#   Read:    TOTAL_RANK
#   Written and exported: NUM_SHARD, NUM_REPLICATE
# Arguments:
#   $1 - shard count; must be a positive decimal integer (default 1)
# Outputs:
#   stdout: the resulting NUM_REPLICATE and NUM_SHARD.
#   stderr: an error message when $1 is invalid.
# Returns:
#   0 on success; 1 if $1 is not a positive integer, in which case no
#   variable is modified.
#######################################
lance_setup_shard_env() {
  local num_shard="${1:-1}"

  # Reject 0 and non-numeric input up front; otherwise the division below
  # would abort with a raw "division by 0" arithmetic error.
  if ! [[ "$num_shard" =~ ^[1-9][0-9]*$ ]]; then
    printf 'lance_setup_shard_env: num_shard must be a positive integer, got: %s\n' \
      "$num_shard" >&2
    return 1
  fi

  NUM_SHARD="$num_shard"
  NUM_REPLICATE=$((TOTAL_RANK / NUM_SHARD))
  export NUM_SHARD NUM_REPLICATE

  echo "NUM_REPLICATE: $NUM_REPLICATE"
  echo "NUM_SHARD: $NUM_SHARD"
}
|