| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
# Workflow name as shown in the GitHub Actions UI.
name: 'e2e_ascend'
|
|
# Triggers: pushes to main or any branch matching v0.*, and pull requests
# targeting main that touch one of the listed paths.
on:
  push:
    branches:
      - main
      - 'v0.*'
  pull_request:
    branches:
      - main
    # Only run for pull requests that change files matching these patterns.
    paths:
      - '.github/workflows/e2e_ascend.yml'
      - '**/*.py'
      - 'docs/ascend_tutorial/**'
      - 'examples/**'
      - 'recipe/**'
      - 'tests/special_npu/**'
      - 'tests/special_sanity/**'
      - 'verl/**'
      - 'pyproject.toml'
      - 'requirements-npu.txt'
      - 'setup.py'
|
|
| |
# One concurrency group per workflow and ref; a newer run cancels the one in
# progress for the same ref, except on refs/heads/main.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: "${{ github.ref != 'refs/heads/main' }}"
|
|
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
|
|
jobs:
  # End-to-end training tests that run inside an Ascend NPU container on a
  # self-hosted runner labelled npu-0.
  test:
    # Only run when the repository owner is volcengine (skipped elsewhere).
    if: github.repository_owner == 'volcengine'
    name: verl Ascend test (self-host)
    runs-on: [self-hosted, npu-0]
    timeout-minutes: 40
    container:
      image: crispig/verl_npu:cann8.1rc1-py3.10-torch2.5.1-vllm-ascend0.7.3.post1-mindspeed0121-250731
      # Host paths mounted into the container: Ascend driver files and tools,
      # plus the dataset and model directories read by the test scripts.
      volumes:
        - /usr/local/dcmi:/usr/local/dcmi
        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
        - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
        - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
        - /etc/ascend_install.info:/etc/ascend_install.info
        - /data00/dataset:/github/home/dataset
        - /data00/models:/github/home/models
      # Folded into a single line of container-creation options.
      options: >-
        --device /dev/davinci0
        --device /dev/davinci_manager
        --device /dev/devmm_svm
        --device /dev/hisi_hdc
        --network host
        --privileged
        --shm-size 16g
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - name: Check npu and CANN info
        run: |
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
          npu-smi info
      - name: Checkout volcengine/verl repo
        uses: actions/checkout@v4
      - name: Install the current repository
        run: |
          pip3 install hf_transfer peft
          pip3 install -r requirements-npu.txt
          pip install -e .
      - name: Install torchvision
        run: |
          pip install torchvision==0.20.1+cpu --index-url https://download.pytorch.org/whl/cpu
      - name: Uninstall Triton
        run: |
          pip uninstall -y triton
      - name: Preprocess gsm8k dataset
        run: |
          python examples/data_preprocess/gsm8k.py --local_dataset_path "${HOME}/dataset/openai/gsm8k"
      - name: Preprocess geo3k dataset
        run: |
          python examples/data_preprocess/geo3k.py --local_dataset_path "${HOME}/dataset/hiyouga/geometry3k"
      # Each test step below first stops any running Ray processes, runs one
      # test script, then removes the checkpoints written under $HOME.
      - name: Running gsm8k e2e qwen3 training tests with PPO on ASCEND NPU
        run: |
          ray stop --force
          bash tests/special_npu/run_qwen3_06b_ppo.sh
          rm -rf "$HOME/ckpts"
      - name: Running gsm8k e2e training tests with peft sft on ASCEND NPU
        run: |
          ray stop --force
          bash tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh
          rm -rf "$HOME/ckpts"
      - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU
        run: |
          ray stop --force
          bash tests/special_npu/run_qwen2_5_05b_grpo.sh
          rm -rf "$HOME/ckpts"
      - name: Running geo3k e2e training tests with GRPO on ASCEND NPU
        run: |
          ray stop --force
          bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh
          rm -rf "$HOME/ckpts"
      - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU
        run: |
          ray stop --force
          bash tests/special_npu/run_qwen2_5_05b_dapo.sh
          rm -rf "$HOME/ckpts"
      - name: Running gsm8k e2e training tests with GRPO MindSpeed on ASCEND NPU
        run: |
          ray stop --force
          USE_DIST_CKPT=True bash tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh
          rm -rf "$HOME/dist_ckpt/qwen2_5_05b_grpo_mindspeed"
          rm -rf "$HOME/ckpts"
      - name: Running NPU profiling unit tests
        run: |
          ray stop --force
          pytest -s -x tests/utils/test_special_mstx_profile.py
      # The per-step `rm -rf` lines above are skipped when a test script fails,
      # because the step stops at the first failing command. This final step
      # runs regardless of earlier failures so leftover checkpoints are removed.
      # NOTE(review): presumably $HOME persists between runs on the self-hosted
      # runner, which is why stale checkpoints matter; confirm.
      - name: Clean up checkpoints
        if: always()
        run: |
          rm -rf "$HOME/dist_ckpt/qwen2_5_05b_grpo_mindspeed"
          rm -rf "$HOME/ckpts"