# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Debug variant of the NeMo CI/CD pipeline. Only the pre-flight, lint, queue,
# container-build, result-aggregation and coverage jobs are active; the actual
# test jobs are kept below as commented-out references.
name: "[debug] CICD NeMo"

on:
  schedule:
    - cron: "0 0 * * *"
    # NOTE(review): debug-only trigger; confirm it should stay enabled.
    - cron: "*/5 * * * *" # Runs every 5 minutes
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.

jobs:
  # Computes the outputs every downstream job gates on:
  #   test_to_run       - JSON array of test names ('[]' means "run nothing")
  #   is_ci_workload    - 'true' for scheduled runs and bump-ci-container* branches
  #   no_fail_fast      - 'true' for scheduled runs or PRs labeled `no-fail-fast`
  #   components_to_run - JSON array of component names
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      test_to_run: ${{ steps.test_to_run.outputs.main }}
      is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
      no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
      components_to_run: ${{ steps.components_to_run.outputs.main }}
    env:
      TESTS_TO_RUN: ${{ inputs.test_to_run }}
      EVENT_NAME: ${{ github.event_name }}
      HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Select components to run
        id: components_to_run
        run: |
          pip install -U pip
          pip install git-python

          # For PRs the helper script writes test_modules.json from the diff
          # between the two SHAs; every other event selects all components.
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
          else
            echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
          fi

          components_to_run=$(cat test_modules.json)

          echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"

      - name: Select tests to run
        id: test_to_run
        run: |
          # For manual dispatch, use the list supplied via the `test_to_run` input as-is
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TESTS_TO_RUN=$TESTS_TO_RUN

          # For correctly labeled PR, run all tests
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
            TESTS_TO_RUN=all

          # For incorrectly labeled PR, run no tests
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
            TESTS_TO_RUN=""

          # For push and schedule events, run all tests. This is so that we can
          # generate coverage on branch `main`.
          elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
            TESTS_TO_RUN=all

          else
            echo "Unsupported event_name $EVENT_NAME provided."
            exit 1
          fi

          # Turn the comma-separated list into a compact JSON array; an empty
          # string becomes '[]', which downstream `if:` conditions test for.
          parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
          echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

      - name: Check if this is a CI workload
        shell: bash
        id: is_ci_workload
        run: |
          branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
            is_ci_workload=true
          else
            is_ci_workload=false
          fi

          # Single write of the output (the true branch used to write it twice).
          echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"

      - name: Check if no-fail-fast is set
        shell: bash
        id: no_fail_fast
        env:
          HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
        run: |
          if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
            no_fail_fast=true
          else
            no_fail_fast=false
          fi

          echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"

  code-linting:
    if: needs.pre-flight.outputs.test_to_run != '[]'
    needs: [pre-flight]
    uses: ./.github/workflows/code-linting.yml

  # Runs in the `test` environment for non-CI workloads; skipped for CI
  # workloads (scheduled runs, bump-ci-container* branches).
  cicd-wait-in-queue:
    needs: [pre-flight]
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.is_ci_workload == 'false'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"

  # Builds the CI container. Also runs when the queue job was skipped because
  # this is a CI workload, since `success()` alone would be false in that case.
  cicd-test-container-build:
    uses: ./.github/workflows/_build_container.yml
    needs: [pre-flight, code-linting, cicd-wait-in-queue]
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      image-name: nemo_container
      dockerfile: docker/Dockerfile.ci

  # cicd-import-tests:
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   needs: [cicd-test-container-build, pre-flight]
  #   runs-on: self-hosted-azure-gpus-1
  #   steps:
  #     - name: Create UUID
  #       id: uuid
  #       run: |
  #         echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"

  #     - name: Checkout NeMo
  #       uses: actions/checkout@v2
  #       with:
  #         repository: NVIDIA/NeMo
  #         path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo

  #     - name: Run some checks
  #       run: |
  #         docker run \
  #           --rm \
  #           --device=/dev/nvidia0 \
  #           --gpus all \
  #           --shm-size=8g \
  #           --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
  #           --env TRANSFORMERS_OFFLINE=0 \
  #           --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
  #             # PyTorch Lightning version
  #             python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"

  #             # PyTorch Lightning DDP Checks
  #             CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"

  #             # Basic Import Checks
  #             python tests/core_ptl/check_imports.py --domain asr
  #             python tests/core_ptl/check_imports.py --domain nlp
  #             python tests/core_ptl/check_imports.py --domain tts
  #           '

  # L0_Setup_Test_Data_And_Models:
  #   needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
  #   runs-on: self-hosted-azure
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #       with:
  #         path: ${{ github.run_id }}

  #     - name: main
  #       uses: NVIDIA/NeMo/.github/actions/test-template@main
  #       with:
  #         runner: ${{ runner.name }}
  #         script: L0_Setup_Test_Data_And_Models
  #         tests_to_run: '["L0_Setup_Test_Data_And_Models"]'

  # cicd-main-unit-tests:
  #   needs: [pre-flight, cicd-test-container-build]
  #   uses: ./.github/workflows/cicd-main-unit-tests.yml
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-export-deploy:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-export-deploy.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-speech:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-speech.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-automodel:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-automodel.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-nemo2:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-nemo2.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
  #         || needs.pre-flight.outputs.components_to_run == '["all"]'
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # Aggregates the run: counts failed/cancelled jobs, removes the `Run CICD`
  # label, notifies via PR comment or Slack, and finally exits non-zero unless
  # the aggregated result is "success".
  Nemo_CICD_Test_Debug:
    needs:
      - pre-flight
      - cicd-test-container-build
      # - cicd-import-tests
      # - L0_Setup_Test_Data_And_Models
      # - cicd-main-unit-tests
      # - cicd-main-nemo2
      # - cicd-main-export-deploy
      # - cicd-main-automodel
      # - cicd-main-speech
    if: always()
    runs-on: ubuntu-latest
    # NOTE(review): write-all is broader than the steps below appear to need.
    permissions: write-all
    steps:
      # One checkout serves both the `gh` CLI calls and notify.py.
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get workflow result
        id: result
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
        run: |
          # Get workflow run details and check job conclusions
          NUM_FAILED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
          NUM_CANCELLED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')

          # Any failed or cancelled job marks the whole run as "failure".
          if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
            RESULT="success"
          else
            RESULT="failure"
          fi

          # Output the final status
          echo "code=$RESULT" | tee -a "$GITHUB_OUTPUT"

      # NOTE(review): the result step only emits "success" or "failure", so the
      # `!= 'cancelled'` part of this condition is always true.
      - name: Remove label if not cancelled
        if: |
          steps.result.outputs.code != 'cancelled'
          && github.event.label.name == 'Run CICD'
          && github.event.pull_request.head.repo.full_name == github.repository
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.number }}
        run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"

      - name: Pipeline successful, add PR comment
        if: |
          always()
          && steps.result.outputs.code == 'success'
          && github.event_name == 'pull_request'
          && env.SLACK_WEBHOOK != ''
        uses: peter-evans/create-or-update-comment@v4
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,

            We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.

            So it might be time to merge this PR or get some approvals.

            Due to a major CI change, merges are currently handled by the automation team.
            We will reach out to you quickly to merge this PR, but you can always reach us with the following handles:

            //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc

      - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
        if: |
          always()
          && steps.result.outputs.code == 'failure'
          && env.SLACK_WEBHOOK != ''
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          PR_NUMBER: ${{ github.event.number }}
          SERVER_URL: ${{ github.server_url }}
        run: |
          set -x
          pip install PyGithub
          export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          python .github/scripts/notify.py

      - name: Exit
        if: ${{ always() }}
        env:
          RESULT: ${{ steps.result.outputs.code }}
        run: |
          # Quoted so an empty RESULT (result step did not run) cleanly takes
          # the failure path instead of producing a `[` syntax error.
          if [[ "$RESULT" == "success" ]]; then
            exit 0
          else
            exit 1
          fi

  # Combines per-job coverage artifacts for each flag and uploads the result.
  Coverage:
    runs-on: ubuntu-latest
    needs: [Nemo_CICD_Test_Debug]
    strategy:
      matrix:
        flag: [unit-test, e2e]
    # The job id in `needs` is Nemo_CICD_Test_Debug; the condition must use the
    # same id or the reference never resolves.
    if: |
      (
        success()
        || needs.Nemo_CICD_Test_Debug.result == 'success'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v4
        with:
          pattern: coverage-${{ matrix.flag }}-*

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          include-hidden-files: true