[ci] migrate multi-node + correctness tests to nightly pipeline (deepjavalibrary#2662)

siddvenk · web-flow · commit 731572901948 · 2025-01-14T16:24:33.000-08:00
diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml
@@ -10,123 +10,14 @@ on:
   schedule:
     - cron: '0 9 * * *'
 
+# TODO: port this to integration tests in 0.31.0 and then delete this file
 jobs:
-  create-runners:
-    runs-on: [self-hosted, scheduler]
+  fast-fail:
+    runs-on: ubuntu-latest
     steps:
-      - name: Create new G6 instance
-        id: create_gpu1
+      - name: Fail if run on master branch
+        id: fast_fail
+        if: github.ref == 'refs/heads/master'
         run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_g6 $token djl-serving
-      - name: Create new G6 instance
-        id: create_gpu2
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_g6 $token djl-serving
-      - name: Create new Inf2.24xl instance
-        id: create_inf2
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_inf2 $token djl-serving
-    outputs:
-      gpu_instance_id_1: ${{ steps.create_gpu1.outputs.action_g6_instance_id }}
-      gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
-      inf2_instance_id: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
-
-  test:
-    runs-on: [ "${{ matrix.test.instance }}" ]
-    timeout-minutes: 90
-    needs: create-runners
-    strategy:
-      fail-fast: false
-      matrix:
-        test:
-          - test: TestCorrectnessTrtLlm
-            instance: g6
-          - test: TestCorrectnessLmiDist
-            instance: g6
-          - test: TestCorrectnessNeuronx
-            instance: inf2
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up JDK 17
-        uses: actions/setup-java@v4
-        with:
-          distribution: 'corretto'
-          java-version: 17
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
-      - name: Install torch
-        # Use torch to get cuda capability of current device to selectively run tests
-        # Torch version doesn't really matter that much
-        run: |
-          pip3 install torch==2.3.0
-      - name: Install awscurl
-        working-directory: tests/integration
-        run: |
-          curl -OL https://publish.djl.ai/awscurl/awscurl
-          chmod +x awscurl
-          mkdir outputs
-      - name: Test
-        working-directory: tests/integration
-        env:
-          TEST_DJL_VERSION: ${{ inputs.djl-version }}
-        run: |
-          python -m pytest -k ${{ matrix.test.test }} tests.py
-      - name: Cleanup
-        working-directory: tests/integration
-        run: |
-          rm -rf outputs
-          rm awscurl
-      - name: On Failure
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
-          sudo rm -rf outputs && sudo rm -rf models
-          rm awscurl
-          docker rm -f $(docker ps -aq) || true
-      - name: Upload test logs
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-${{ matrix.test.test }}-logs
-          path: tests/integration/all_logs/
-
-  stop-runners:
-    if: always()
-    runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, test]
-    steps:
-      - name: Stop all instances
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
-          ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
-          ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.inf2_instance_id }}
-          ./stop_instance.sh $instance_id
+          echo "Fast fail"
+          exit 1
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -197,6 +197,18 @@ jobs:
           - test: TestLmiDistPipelineParallel
             instance: g6
             failure-prefix: lmi
+          - test: TestLmiDistMultiNode
+            instance: g6
+            failure-prefix: lmi
+          - test: TestCorrectnessTrtLlm
+            instance: g6
+            failure-prefix: trtllm
+          - test: TestCorrectnessLmiDist
+            instance: g6
+            failure-prefix: lmi
+          - test: TestCorrectnessNeuronx
+            instance: inf2
+            failure-prefix: neuron
     outputs:
       failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
       failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
diff --git a/.github/workflows/multi_node_integration.yml b/.github/workflows/multi_node_integration.yml
@@ -7,108 +7,15 @@ on:
         description: 'The released version of DJL'
         required: false
         default: ''
-  schedule:
-    - cron: '0 13 * * *'
-
 
+# TODO: port this to integration tests in 0.31.0 and then delete this file
 jobs:
-  create-runners:
-    runs-on: [self-hosted, scheduler]
-    steps:
-      - name: Create new G6 instance
-        id: create_gpu
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_g6 $token djl-serving
-    outputs:
-      gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
-
-  multi-node-test:
-    runs-on:
-      - ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
-      - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
-      - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
-      - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
-      - ${{ matrix.test.instance }}
-    timeout-minutes: 60
-    needs: create-runners
-    strategy:
-      fail-fast: false
-      matrix:
-        test:
-          - test: TestLmiDistMultiNode
-            instance: g6
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        if: ${{ matrix.test.instance != 'aarch64' }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Set up Python3 (aarch64)
-        if: ${{ matrix.test.instance == 'aarch64' }}
-        run: |
-          # Using an alternate installation because of an incompatible combination
-          # of aarch64 with ubuntu-20.04 not supported by the actions/setup-python
-          sudo apt-get install python3 python-is-python3 python3-pip -y
-      - name: Install pip dependencies
-        run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
-      - name: Install torch
-        # Use torch to get cuda capability of current device to selectively run tests
-        # Torch version doesn't really matter that much
-        run: |
-          pip3 install torch==2.3.0
-      - name: Install awscurl
-        working-directory: tests/integration
-        run: |
-          wget https://publish.djl.ai/awscurl/awscurl
-          chmod +x awscurl
-          mkdir outputs
-      - name: Test
-        working-directory: tests/integration
-        env:
-          TEST_DJL_VERSION: ${{ inputs.djl-version }}
-        run: |
-          python -m pytest -k ${{ matrix.test.test }} tests.py
-      - name: Cleanup
-        working-directory: tests/integration
-        run: |
-          rm -rf outputs
-          rm awscurl
-      - name: On Failure
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
-          echo "Printing lmi worker log"
-          cat all_logs/llama3-8b/lmi-worker.log
-          sudo rm -rf outputs && sudo rm -rf models
-          rm awscurl
-          ./remove_container.sh
-      - name: Upload test logs
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-${{ matrix.test.test }}-logs
-          path: tests/integration/all_logs/
-
-  stop-runners:
-    if: always()
-    runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, multi-node-test]
+  fast-fail:
+    runs-on: ubuntu-latest
     steps:
-      - name: Stop all instances
+      - name: Fail if run on master branch
+        id: fast_fail
+        if: github.ref == 'refs/heads/master'
         run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
-          ./stop_instance.sh $instance_id
+          echo "Fast fail"
+          exit 1