From ea787b08ff3c8167f5568a0634f385fa3212dab4 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Feb 2025 23:17:11 +0000 Subject: [PATCH 1/9] Test build to check processing by different K8 queues. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-pipeline.yaml | 4 ++++ .buildkite/test-template.j2 | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a847a68a6ef71..a038fb592dacc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -93,6 +93,7 @@ steps: - label: Core Test # 10min mirror_hardwares: [amd] + amd_gpus: 4 # Just for the sake of queue testing fast_check: true source_file_dependencies: - vllm/core @@ -105,6 +106,7 @@ steps: working_dir: "/vllm-workspace/tests" fast_check: true mirror_hardwares: [amd] + amd_gpus: 2 # Just for the sake of queue testing source_file_dependencies: - vllm/ commands: @@ -257,6 +259,7 @@ steps: - label: LoRA Test %N # 15min each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - vllm/lora - tests/lora @@ -283,6 +286,7 @@ steps: - label: Kernels Test %N # 1h each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - csrc/ - vllm/attention diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index ce448836a8278..0d7c4ca9c75db 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -27,7 +27,15 @@ steps: depends_on: - "amd-build" agents: - queue: amd_gpu +{% if step.amd_gpus and step.amd_gpus==8%} + queue: amd_gpu_8 +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu_4 +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu_4 +{% else%} + queue: amd_gpu_1 +{% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: From 01dfddaa3bc466fae336d19dc39c42605eeb97d5 Mon Sep 17 00:00:00 2001 From: "Alexei 
V. Ivanov" Date: Wed, 5 Feb 2025 00:12:56 +0000 Subject: [PATCH 2/9] Testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 009e929ebace1..feda9b8dfaaf1 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="0" +ARG REMOTE_VLLM="1" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app From 7f80bf893fbb7c7332dbb48f8da2da6119f31644 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 00:33:23 +0000 Subject: [PATCH 3/9] Copying over the tests directory to enable CI testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index feda9b8dfaaf1..c28ffee094974 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="1" +ARG REMOTE_VLLM="0" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app @@ -108,6 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 14aaf35a1871e0bea62d05ca7e7b2de199991c6a Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 05:06:38 +0000 Subject: [PATCH 4/9] Comparing with MI250 in the "mi250_8xGPU" queue. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 0d7c4ca9c75db..67bd8b5d15aec 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% else%} - queue: amd_gpu_1 + queue: mi250_8xGPU {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From a1064893a9eda82cf29f1181a04fe753dd47c58d Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 06:39:04 +0000 Subject: [PATCH 5/9] Building with "test" as a --target Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 10 +++++----- Dockerfile.rocm | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 67bd8b5d15aec..7106395910d3e 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." 
- "docker push {{ docker_image_amd }}" key: "amd-build" env: @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: mi250_8xGPU + queue: amd_gpu_8 {% elif step.amd_gpus and step.amd_gpus==4%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% elif step.amd_gpus and step.amd_gpus==2%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% else%} - queue: mi250_8xGPU + queue: amd_gpu_1 {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" diff --git a/Dockerfile.rocm b/Dockerfile.rocm index c28ffee094974..3965880bfd7c8 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests - +#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests +#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 6acfc3aba4cbc7ad79ad9ed86315e39bc37ff065 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 08:04:00 +0000 Subject: [PATCH 6/9] Fixing working directory property. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-pipeline.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a038fb592dacc..9df17920788d6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 4 # Just for the sake of queue testing fast_check: true @@ -178,6 +179,7 @@ steps: - pytest -v -s engine test_sequence.py test_config.py test_logger.py # OOM in the CI unless we run this separately - pytest -v -s tokenization + working_dir: "/vllm-workspace/tests" # optional - label: V1 Test #mirror_hardwares: [amd] @@ -219,6 +221,7 @@ steps: - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -237,6 +240,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/model_executor/layers @@ -258,6 +262,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -285,6 +290,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -296,6 +302,7 @@ steps: parallelism: 4 - label: Tensorizer Test # 11min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] soft_fail: true source_file_dependencies: @@ -338,6 +345,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use 
# 20 min + working_dir: "/vllm-workspace/tests" fast_check: false mirror_hardwares: [ amd ] source_file_dependencies: From 172e0e8bd375d43ccfc41aa1d83f2d21256e78cf Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 18:17:17 +0000 Subject: [PATCH 7/9] Dummy alteration to confirm trouble with simultaneous test execution. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3965880bfd7c8..edb042c68f691 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests -#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 95d444bbb5d5dc639ab1e35f2b384a928701bcfb Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 6 Feb 2025 16:58:36 +0000 Subject: [PATCH 8/9] Testing a single complete 8xGPU MI300 machine with CI tests. Signed-off-by: Alexei V. 
Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 7106395910d3e..96a8903a946de 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 + queue: amd_gpu {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 + queue: amd_gpu {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 + queue: amd_gpu {% else%} - queue: amd_gpu_1 + queue: amd_gpu {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From ca69176a5092e8382518662d27a8d55044ca96bc Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 6 Feb 2025 18:44:53 +0000 Subject: [PATCH 9/9] Changing the testing queue name to decouple from the default configuration. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 96a8903a946de..c0bd940e22de6 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu + queue: amd_8xgpu {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu + queue: amd_8xgpu {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu + queue: amd_8xgpu {% else%} - queue: amd_gpu + queue: amd_8xgpu {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"