File size: 3,253 Bytes
8b306b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/bin/bash

#######################################
# Print the first TCP port in [start_port, end_port) that can be bound on
# all IPv4 interfaces.
# Arguments:
#   $1 - first port to probe (default: 6666)
#   $2 - exclusive upper bound of the probe range (default: 8888)
# Outputs:
#   stdout: the selected port. When no port in the range can be bound (or
#           python3 is not installed), start_port is printed as a fallback.
#   stderr: a warning whenever the fallback is used.
# Notes:
#   The port is released again before this function returns, so another
#   process may still grab it before the caller binds it.
#######################################
find_available_port() {
    local start_port="${1:-6666}"
    local end_port="${2:-8888}"

    # Without python3 the probe cannot run; keep callers working by handing
    # back the requested start port instead of an empty string.
    if ! command -v python3 >/dev/null 2>&1; then
        echo "find_available_port: python3 not found; falling back to port $start_port" >&2
        echo "$start_port"
        return 0
    fi

    python3 - "$start_port" "$end_port" <<'PY'
import socket
import sys

start_port = int(sys.argv[1])
end_port = int(sys.argv[2])

for port in range(start_port, end_port):
    try:
        # The context manager closes the probe socket on every path,
        # including a failed bind().
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.bind(("", port))
    except (OSError, OverflowError):
        # OSError: port in use / not permitted.
        # OverflowError: port number outside 0-65535.
        continue
    print(port)
    raise SystemExit(0)

print(
    f"find_available_port: no free port in [{start_port}, {end_port}); "
    f"falling back to {start_port}",
    file=sys.stderr,
)
print(start_port)
PY
}


#######################################
# Export the environment variables shared by every launch script, keeping
# any value the caller already set and filling in a default otherwise.
# Globals (written and exported):
#   EXP_HW_20250819, POSITION_EMBEDDING_3D_VERSION, CUDA_LAUNCH_BLOCKING,
#   NCCL_DEBUG, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC
# Outputs:
#   Two lines on stdout reporting the resolved EXP_HW_20250819 and
#   POSITION_EMBEDDING_3D_VERSION values.
#######################################
lance_setup_common_env() {
    # ":=" assigns the default only when the variable is unset or empty.
    : "${EXP_HW_20250819:=False}"
    : "${POSITION_EMBEDDING_3D_VERSION:=v2}"

    # CUDA runs asynchronously by default for benchmark/inference throughput;
    # set CUDA_LAUNCH_BLOCKING=1 yourself only while debugging kernel failures.
    : "${CUDA_LAUNCH_BLOCKING:=0}"
    : "${NCCL_DEBUG:=VERSION}"
    : "${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:=900}"

    export EXP_HW_20250819 POSITION_EMBEDDING_3D_VERSION
    export CUDA_LAUNCH_BLOCKING NCCL_DEBUG TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC

    printf '%s\n' \
        "EXP_HW_20250819: $EXP_HW_20250819" \
        "(shell) POSITION_EMBEDDING_3D_VERSION: $POSITION_EMBEDDING_3D_VERSION"
}


#######################################
# Resolve and export the variables describing the distributed launch
# topology (machine count, rank, rendezvous address and port).
# Arguments:
#   $1 - number of GPUs per machine (default: 1)
# Globals (read):
#   ARNOLD_WORKER_NUM, ARNOLD_ID, ARNOLD_WORKER_0_HOST, ARNOLD_WORKER_0_PORT
#   NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP, MAIN_PROCESS_PORT
#   (pre-set non-empty values always take precedence over derived ones)
# Globals (written and exported):
#   NUM_GPUS, NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP,
#   MAIN_PROCESS_PORT, TOTAL_RANK
# Port selection when MAIN_PROCESS_PORT is not pre-set:
#   - single machine: probe 500 ports starting at the default port with
#     find_available_port; fall back to the default if the probe prints
#     nothing.
#   - multiple machines: use the default port unchanged. A port probed
#     locally could differ from machine to machine, so no probing is done.
#   The default port is ARNOLD_WORKER_0_PORT (or 6666) when
#   ARNOLD_WORKER_NUM is set, and 6666 otherwise.
# Outputs:
#   Progress messages and the resolved values on stdout.
#######################################
lance_setup_distributed_env() {
    local num_gpus="${1:-1}"
    local default_main_process_port
    local on_arnold=0

    NUM_GPUS="$num_gpus"

    if [ -n "${ARNOLD_WORKER_NUM:-}" ]; then
        on_arnold=1
        echo "检测到 ARNOLD 平台环境"
        NUM_MACHINES="${NUM_MACHINES:-$ARNOLD_WORKER_NUM}"
        MACHINE_RANK="${MACHINE_RANK:-${ARNOLD_ID:-0}}"
        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-${ARNOLD_WORKER_0_HOST:-127.0.0.1}}"
        default_main_process_port="${ARNOLD_WORKER_0_PORT:-6666}"
    else
        echo "使用本地环境配置"
        NUM_MACHINES="${NUM_MACHINES:-1}"
        MACHINE_RANK="${MACHINE_RANK:-0}"
        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-127.0.0.1}"
        default_main_process_port=6666
    fi

    # An explicitly provided MAIN_PROCESS_PORT is never overridden.
    if [ -z "${MAIN_PROCESS_PORT:-}" ]; then
        if [ "$NUM_MACHINES" = "1" ]; then
            MAIN_PROCESS_PORT="$(find_available_port "$default_main_process_port" "$((default_main_process_port + 500))")"
            # The probe may print nothing (e.g. its interpreter is missing);
            # never export an empty port.
            if [ -z "$MAIN_PROCESS_PORT" ]; then
                MAIN_PROCESS_PORT="$default_main_process_port"
            fi
        else
            # Multi-machine: keep the deterministic default rather than a
            # locally probed port.
            MAIN_PROCESS_PORT="$default_main_process_port"
            if [ "$on_arnold" -eq 1 ]; then
                echo "多机任务使用平台 rendezvous 端口: $MAIN_PROCESS_PORT"
            fi
        fi
    fi

    TOTAL_RANK=$((NUM_MACHINES * NUM_GPUS))

    export NUM_GPUS NUM_MACHINES MACHINE_RANK MAIN_PROCESS_IP MAIN_PROCESS_PORT TOTAL_RANK

    echo "NUM_MACHINES: $NUM_MACHINES"
    echo "NUM_GPUS: $NUM_GPUS"
    echo "TOTAL_RANK: $TOTAL_RANK"
    echo "MACHINE_RANK: $MACHINE_RANK"
    echo "MAIN_PROCESS_IP: $MAIN_PROCESS_IP"
    echo "MAIN_PROCESS_PORT: $MAIN_PROCESS_PORT"
}


#######################################
# Derive and export the shard / replicate counts from the total rank count.
# Arguments:
#   $1 - number of shards; must be a positive integer (default: 1)
# Globals (read):
#   TOTAL_RANK - treated as 0 (with a warning) when unset or empty
# Globals (written and exported):
#   NUM_SHARD     - the validated shard count
#   NUM_REPLICATE - TOTAL_RANK / NUM_SHARD, using integer division
# Outputs:
#   stdout: the resolved NUM_REPLICATE and NUM_SHARD values.
#   stderr: validation errors and warnings.
# Returns:
#   0 on success; 1 when an argument or TOTAL_RANK is not a usable integer
#   (in which case NUM_SHARD / NUM_REPLICATE are left untouched).
#######################################
lance_setup_shard_env() {
    local num_shard="${1:-1}"
    local total_rank="${TOTAL_RANK:-}"

    # Reject anything that is not a plain decimal number before it reaches
    # the arithmetic below (a zero divisor would abort with a raw bash error).
    case "$num_shard" in
        ''|*[!0-9]*)
            echo "lance_setup_shard_env: num_shard must be a positive integer, got '$num_shard'" >&2
            return 1
            ;;
    esac
    # Force base 10 so values with leading zeros are not parsed as octal.
    num_shard=$((10#$num_shard))
    if [ "$num_shard" -le 0 ]; then
        echo "lance_setup_shard_env: num_shard must be greater than 0" >&2
        return 1
    fi

    if [ -z "$total_rank" ]; then
        echo "lance_setup_shard_env: TOTAL_RANK is not set; treating it as 0" >&2
        total_rank=0
    fi
    case "$total_rank" in
        *[!0-9]*)
            echo "lance_setup_shard_env: TOTAL_RANK must be a non-negative integer, got '$total_rank'" >&2
            return 1
            ;;
    esac
    total_rank=$((10#$total_rank))

    NUM_SHARD="$num_shard"
    NUM_REPLICATE=$((total_rank / num_shard))

    # Integer division truncates; make a lossy split visible to the user.
    if [ $((total_rank % num_shard)) -ne 0 ]; then
        echo "lance_setup_shard_env: TOTAL_RANK ($total_rank) is not divisible by NUM_SHARD ($num_shard); NUM_REPLICATE truncated to $NUM_REPLICATE" >&2
    fi

    export NUM_SHARD NUM_REPLICATE

    echo "NUM_REPLICATE: $NUM_REPLICATE"
    echo "NUM_SHARD: $NUM_SHARD"
}