| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Workflow name shown in the GitHub Actions UI.
name: e2e_sft
|
|
on:
  # Every push to main or to a v0.* branch runs the workflow.
  push:
    branches:
      - main
      - v0.*
  # Pull requests targeting main or a v0.* branch run the workflow only when a
  # changed file passes the path filters below. Patterns are evaluated in
  # order, so a later match overrides an earlier one.
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      # Start from every Python file in the repository ...
      - "**/*.py"
      # ... then drop examples, tests and the trainer entrypoint scripts,
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # recipes,
      - "!recipe/**"
      # and Megatron worker modules.
      - "!verl/workers/**/megatron_*.py"
      # Finally re-include this workflow file and the files its job runs.
      - ".github/workflows/e2e_sft.yml"
      - "examples/data_preprocess/gsm8k.py"
      # Also executed by the job, so it must override "!examples/**" too.
      - "examples/data_preprocess/gsm8k_multiturn_sft.py"
      # A pattern without a wildcard only matches that exact path; "/**" is
      # required to match the scripts inside the directory and to override
      # the "!tests/**" exclusion above.
      - "tests/special_e2e/sft/**"
      - "verl/trainer/fsdp_sft_trainer.py"
      - "verl/trainer/config/sft_trainer.yaml"
|
|
| |
# Runs are grouped per workflow and ref. A newer run in the same group cancels
# the one in progress, except on refs/heads/main where the expression below
# evaluates to false.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
|
| |
# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
|
|
# Workflow-level values consumed by the runner create/destroy steps:
#   IMAGE                    - passed as `mlp-image` when the runner is created.
#   DYNAMIC_RUNNER_ENDPOINT  - passed as `faas-url` to both create and destroy.
env:
  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
|
|
jobs:
  # Calls volcengine/vemlp-github-runner in "create" mode and publishes the
  # resulting runner label and MLP task id as job outputs.
  setup:
    # Only runs for repositories owned by `volcengine`; skipped elsewhere.
    if: github.repository_owner == 'volcengine'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      # Pinned to the same commit SHA the e2e_sft job uses instead of a
      # floating major-version tag.
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  # Runs the SFT end-to-end scripts on the runner created by `setup`.
  e2e_sft:
    needs: setup
    # Falls back to the 'L20x8' label when setup produced an empty runner label.
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 25
    env:
      # Proxy settings come from repository secrets; the Hugging Face mirror
      # and local addresses are listed in NO_PROXY.
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          # 0 fetches the full history rather than a shallow clone.
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install peft
          # Quoted so the shell cannot treat the bracket expression as a glob.
          pip3 install --no-deps -e ".[test,gpu]"
      # Every step below starts with `ray stop --force`, presumably to clear
      # Ray processes left behind by the previous step.
      - name: Prepare gsm8k dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py --local_dataset_path "${HOME}/models/hf_data/gsm8k"
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs w/o rmpad using function rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallism
        run: |
          ray stop --force
          SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
      - name: Check loss difference between sequence parallel vs. default implementation
        run: |
          ray stop --force
          ENTRYPOINT="tests/special_e2e/sft/test_sp_loss_match.py" SP_SIZE=2 bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with sequence parallism and liger
        run: |
          ray stop --force
          SP_SIZE=2 LIGER=True bash tests/special_e2e/sft/run_sft.sh
      - name: Running GSM8K E2E training tests with LoRA
        run: |
          ray stop --force
          LORA_RANK=32 bash tests/special_e2e/sft/run_sft.sh
      - name: Run GSM8K E2E training and resume tests resuming from the checkpoint manager
        run: |
          ray stop --force
          LORA_RANK=32 RESUME_MODE=auto TOTAL_TRAIN_STEP=2 bash tests/special_e2e/sft/run_sft.sh
      # Named distinctly from the earlier dataset step so the two can be told
      # apart in the run log.
      - name: Prepare gsm8k multiturn SFT dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k_multiturn_sft.py --local_dataset_path "${HOME}/models/hf_data/gsm8k"
      - name: Running GSM8K E2E training tests with multiturn and various configs and compare results
        run: |
          bash tests/special_e2e/sft/test_sft_engine_all.sh

  # Calls volcengine/vemlp-github-runner in "destroy" mode with the task id
  # published by `setup`.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, e2e_sft]
    # always() keeps cleanup running after a failed or cancelled test job; the
    # extra check skips it when `setup` itself was skipped, because no task id
    # exists to destroy in that case.
    if: always() && needs.setup.result != 'skipped'
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
|
|