Skip to content

Commit 7315729

Browse files
authored
[ci] migrate multi-node + correctness tests to nightly pipeline (deepjavalibrary#2662)
1 parent 2ae0aab commit 7315729

File tree

3 files changed

+28
-218
lines changed

3 files changed

+28
-218
lines changed

.github/workflows/correctness.yml

+8 -117
Original file line number | Diff line number | Diff line change
@@ -10,123 +10,14 @@ on:
1010
schedule:
1111
- cron: '0 9 * * *'
1212

13+
# TODO: port this to integration tests in 0.31.0 and then delete this file
1314
jobs:
14-
create-runners:
15-
runs-on: [self-hosted, scheduler]
15+
fast-fail:
16+
runs-on: ubuntu-latest
1617
steps:
17-
- name: Create new G6 instance
18-
id: create_gpu1
18+
- name: Fail if run on master branch
19+
id: fast_fail
20+
if: github.ref == 'refs/heads/master'
1921
run: |
20-
cd /home/ubuntu/djl_benchmark_script/scripts
21-
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
22-
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
23-
--fail \
24-
| jq '.token' | tr -d '"' )
25-
./start_instance.sh action_g6 $token djl-serving
26-
- name: Create new G6 instance
27-
id: create_gpu2
28-
run: |
29-
cd /home/ubuntu/djl_benchmark_script/scripts
30-
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
31-
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
32-
--fail \
33-
| jq '.token' | tr -d '"' )
34-
./start_instance.sh action_g6 $token djl-serving
35-
- name: Create new Inf2.24xl instance
36-
id: create_inf2
37-
run: |
38-
cd /home/ubuntu/djl_benchmark_script/scripts
39-
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
40-
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
41-
--fail \
42-
| jq '.token' | tr -d '"' )
43-
./start_instance.sh action_inf2 $token djl-serving
44-
outputs:
45-
gpu_instance_id_1: ${{ steps.create_gpu1.outputs.action_g6_instance_id }}
46-
gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
47-
inf2_instance_id: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
48-
49-
test:
50-
runs-on: [ "${{ matrix.test.instance }}" ]
51-
timeout-minutes: 90
52-
needs: create-runners
53-
strategy:
54-
fail-fast: false
55-
matrix:
56-
test:
57-
- test: TestCorrectnessTrtLlm
58-
instance: g6
59-
- test: TestCorrectnessLmiDist
60-
instance: g6
61-
- test: TestCorrectnessNeuronx
62-
instance: inf2
63-
steps:
64-
- uses: actions/checkout@v4
65-
- name: Clean env
66-
run: |
67-
yes | docker system prune -a --volumes
68-
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
69-
echo "wait dpkg lock..."
70-
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
71-
- name: Set up JDK 17
72-
uses: actions/setup-java@v4
73-
with:
74-
distribution: 'corretto'
75-
java-version: 17
76-
- name: Set up Python3
77-
uses: actions/setup-python@v5
78-
with:
79-
python-version: '3.10.x'
80-
- name: Install pip dependencies
81-
run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
82-
- name: Install torch
83-
# Use torch to get cuda capability of current device to selectively run tests
84-
# Torch version doesn't really matter that much
85-
run: |
86-
pip3 install torch==2.3.0
87-
- name: Install awscurl
88-
working-directory: tests/integration
89-
run: |
90-
curl -OL https://publish.djl.ai/awscurl/awscurl
91-
chmod +x awscurl
92-
mkdir outputs
93-
- name: Test
94-
working-directory: tests/integration
95-
env:
96-
TEST_DJL_VERSION: ${{ inputs.djl-version }}
97-
run: |
98-
python -m pytest -k ${{ matrix.test.test }} tests.py
99-
- name: Cleanup
100-
working-directory: tests/integration
101-
run: |
102-
rm -rf outputs
103-
rm awscurl
104-
- name: On Failure
105-
if: ${{ failure() }}
106-
working-directory: tests/integration
107-
run: |
108-
for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
109-
sudo rm -rf outputs && sudo rm -rf models
110-
rm awscurl
111-
docker rm -f $(docker ps -aq) || true
112-
- name: Upload test logs
113-
if: ${{ always() }}
114-
uses: actions/upload-artifact@v4
115-
with:
116-
name: test-${{ matrix.test.test }}-logs
117-
path: tests/integration/all_logs/
118-
119-
stop-runners:
120-
if: always()
121-
runs-on: [ self-hosted, scheduler ]
122-
needs: [ create-runners, test]
123-
steps:
124-
- name: Stop all instances
125-
run: |
126-
cd /home/ubuntu/djl_benchmark_script/scripts
127-
instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
128-
./stop_instance.sh $instance_id
129-
instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
130-
./stop_instance.sh $instance_id
131-
instance_id=${{ needs.create-runners.outputs.inf2_instance_id }}
132-
./stop_instance.sh $instance_id
22+
echo "Fast fail"
23+
exit 1

.github/workflows/integration.yml

+12
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,18 @@ jobs:
197197
- test: TestLmiDistPipelineParallel
198198
instance: g6
199199
failure-prefix: lmi
200+
- test: TestLmiDistMultiNode
201+
instance: g6
202+
failure-prefix: lmi
203+
- test: TestCorrectnessTrtLlm
204+
instance: g6
205+
failure-prefix: trtllm
206+
- test: TestCorrectnessLmiDist
207+
instance: g6
208+
failure-prefix: lmi
209+
- test: TestCorrectnessNeuronx
210+
instance: inf2
211+
failure-prefix: neuron
200212
outputs:
201213
failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
202214
failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}

.github/workflows/multi_node_integration.yml

+8 -101
Original file line number | Diff line number | Diff line change
@@ -7,108 +7,15 @@ on:
77
description: 'The released version of DJL'
88
required: false
99
default: ''
10-
schedule:
11-
- cron: '0 13 * * *'
12-
1310

11+
# TODO: port this to integration tests in 0.31.0 and then delete this file
1412
jobs:
15-
create-runners:
16-
runs-on: [self-hosted, scheduler]
17-
steps:
18-
- name: Create new G6 instance
19-
id: create_gpu
20-
run: |
21-
cd /home/ubuntu/djl_benchmark_script/scripts
22-
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
23-
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
24-
--fail \
25-
| jq '.token' | tr -d '"' )
26-
./start_instance.sh action_g6 $token djl-serving
27-
outputs:
28-
gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
29-
30-
multi-node-test:
31-
runs-on:
32-
- ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
33-
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
34-
- ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
35-
- ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
36-
- ${{ matrix.test.instance }}
37-
timeout-minutes: 60
38-
needs: create-runners
39-
strategy:
40-
fail-fast: false
41-
matrix:
42-
test:
43-
- test: TestLmiDistMultiNode
44-
instance: g6
45-
steps:
46-
- uses: actions/checkout@v4
47-
- name: Clean env
48-
run: |
49-
yes | docker system prune -a --volumes
50-
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
51-
echo "wait dpkg lock..."
52-
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
53-
- name: Set up Python3
54-
if: ${{ matrix.test.instance != 'aarch64' }}
55-
uses: actions/setup-python@v5
56-
with:
57-
python-version: '3.10.x'
58-
- name: Set up Python3 (aarch64)
59-
if: ${{ matrix.test.instance == 'aarch64' }}
60-
run: |
61-
# Using an alternate installation because of an incompatible combination
62-
# of aarch64 with ubuntu-20.04 not supported by the actions/setup-python
63-
sudo apt-get install python3 python-is-python3 python3-pip -y
64-
- name: Install pip dependencies
65-
run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
66-
- name: Install torch
67-
# Use torch to get cuda capability of current device to selectively run tests
68-
# Torch version doesn't really matter that much
69-
run: |
70-
pip3 install torch==2.3.0
71-
- name: Install awscurl
72-
working-directory: tests/integration
73-
run: |
74-
wget https://publish.djl.ai/awscurl/awscurl
75-
chmod +x awscurl
76-
mkdir outputs
77-
- name: Test
78-
working-directory: tests/integration
79-
env:
80-
TEST_DJL_VERSION: ${{ inputs.djl-version }}
81-
run: |
82-
python -m pytest -k ${{ matrix.test.test }} tests.py
83-
- name: Cleanup
84-
working-directory: tests/integration
85-
run: |
86-
rm -rf outputs
87-
rm awscurl
88-
- name: On Failure
89-
if: ${{ failure() }}
90-
working-directory: tests/integration
91-
run: |
92-
for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
93-
echo "Printing lmi worker log"
94-
cat all_logs/llama3-8b/lmi-worker.log
95-
sudo rm -rf outputs && sudo rm -rf models
96-
rm awscurl
97-
./remove_container.sh
98-
- name: Upload test logs
99-
if: ${{ always() }}
100-
uses: actions/upload-artifact@v4
101-
with:
102-
name: test-${{ matrix.test.test }}-logs
103-
path: tests/integration/all_logs/
104-
105-
stop-runners:
106-
if: always()
107-
runs-on: [ self-hosted, scheduler ]
108-
needs: [ create-runners, multi-node-test]
13+
fast-fail:
14+
runs-on: ubuntu-latest
10915
steps:
110-
- name: Stop all instances
16+
- name: Fail if run on master branch
17+
id: fast_fail
18+
if: github.ref == 'refs/heads/master'
11119
run: |
112-
cd /home/ubuntu/djl_benchmark_script/scripts
113-
instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
114-
./stop_instance.sh $instance_id
20+
echo "Fast fail"
21+
exit 1

0 commit comments

Comments
 (0)