Add Baseline for SGLang Benchmark Test #602

Merged
merged 48 commits into from
Dec 4, 2024
Commits
48 commits
3c21be0
Add benchmark using sglang server,
stbaione Nov 22, 2024
31398a5
Fix import path in `shortfin_benchmark_test`,
stbaione Nov 22, 2024
4d0323f
Merge branch 'main' into sgl-benchmark-add-baseline
stbaione Nov 22, 2024
fc78284
Change `ci-sglang-benchmark/integration` to use `mi300x-4`,
stbaione Nov 22, 2024
0909e8f
Fix github runner label
stbaione Nov 22, 2024
d7cc539
Add installation steps, since test does require some functionality fr…
stbaione Nov 22, 2024
cf16e54
Fix typo in model names
stbaione Nov 22, 2024
86058b8
Add container name,
stbaione Nov 22, 2024
acbedb0
Temporarily remove `--rm` to try and obtain container logs after failure
stbaione Nov 22, 2024
34c8410
Remove quotes around HF_TOKEN
stbaione Nov 22, 2024
0d5574d
Try using env var for HF_SECRET
stbaione Nov 23, 2024
c9f4d33
Move secrets.HF_TOKEN back to command
stbaione Nov 25, 2024
4fa094c
Add temporary command to see if HF_TOKEN is being set properly
stbaione Nov 25, 2024
c33ef75
Add back command to rm container once stopped
stbaione Nov 25, 2024
6986765
Merge branch 'main' of https://github.com/nod-ai/shark-ai into users/…
stbaione Nov 25, 2024
fea2655
Allow for full e2e verification
stbaione Nov 25, 2024
3641445
Update hash for pip cache in benchmark and integration tests
stbaione Nov 25, 2024
d82d9df
Remove version pinning for `iree-base-compiler` and `iree-base-runtime`
stbaione Nov 25, 2024
e843281
Add `--pre` to iree installations in SGLang tests
stbaione Nov 26, 2024
7fe76d2
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Nov 26, 2024
ea65936
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 2, 2024
01da13c
Slightly lower threshold in integration tests, to allow still valid, …
stbaione Dec 2, 2024
ed37ef1
Fix `publish_dir` in `Deploy to Github Pages` step
stbaione Dec 2, 2024
d1e434f
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 2, 2024
d29c7bb
Remove unneeded deps for SGLang benchmark,
stbaione Dec 2, 2024
4529e09
Comment out `needs` line for CI validation
stbaione Dec 2, 2024
09e0fb6
Remove temporary disablements,
stbaione Dec 2, 2024
422729f
Remove `Get Current Date` step in shortfin benchmark job
stbaione Dec 2, 2024
969b608
Add `README` description to top of CI file
stbaione Dec 2, 2024
f67e399
Add job to merge html reports from both benchmark jobs and upload to …
stbaione Dec 3, 2024
6909edc
Fix upload/download paths
stbaione Dec 3, 2024
aa35176
Split download into two steps
stbaione Dec 3, 2024
9578acc
Ensure all html files are in same dir
stbaione Dec 3, 2024
b9b9ea5
Remove PR trigger
stbaione Dec 3, 2024
526194f
Remove `sharktank` installation from SGLang benchmark,
stbaione Dec 3, 2024
57babdf
Use hf to download tokenizer in `sglang_benchmark_test`
stbaione Dec 3, 2024
8f7f0fb
Small cleanup of sglang ci deps section
stbaione Dec 3, 2024
305c4b0
Make shortfin/sglang benchmark such that they still run sequentially,…
stbaione Dec 3, 2024
d78ab73
Remove dep on shortfin benchmark in sgl benchmark,
stbaione Dec 3, 2024
29a8221
Make sure `merge_and_upload_reports` waits for prior jobs to finish
stbaione Dec 3, 2024
35960e0
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 3, 2024
7efecb8
Move code checkout to first step in `benchmark_sglang`
stbaione Dec 3, 2024
d28bf01
Merge branch 'users/stbaione/sgl-benchmark-add-baseline' of https://g…
stbaione Dec 3, 2024
1e90573
Remove PR trigger
stbaione Dec 3, 2024
3f6564f
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 4, 2024
b1ec485
Repin `iree-base-compiler` and `iree-base-runtime` due to `abort` issue
stbaione Dec 4, 2024
56d3a5c
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 4, 2024
5d406c8
Merge branch 'main' into users/stbaione/sgl-benchmark-add-baseline
stbaione Dec 4, 2024
132 changes: 123 additions & 9 deletions .github/workflows/ci-sglang-benchmark.yml
@@ -21,9 +21,9 @@ concurrency:
cancel-in-progress: true

jobs:
sglang_bench_serve:
benchmark_shortfin:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
name: "SGLang Serving Benchmark Tests"
name: "SGLang Serving Benchmark With Shortfin"
strategy:
matrix:
version: [3.11]
@@ -53,7 +53,7 @@ jobs:
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}

- name: Install pip deps
run: |
@@ -69,21 +69,135 @@ jobs:
# Try with the latest nightly releases, not what iree-turbine pins.
# We could also pin to a known working or stable version.
# This should eventually stabilize. Do the best we can for now.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
iree-base-compiler==3.0.0rc20241118 \
iree-base-runtime==3.0.0rc20241118 \
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime \
"numpy<2.0"

- name: Install SGLang
run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

- name: Launch Shortfin Server
run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html
- name: Run Shortfin Benchmark Tests
run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
with:
github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
publish_dir: ./out/llm/shortfin
destination_dir: ./llm/sgl_benchmark/shortfin
keep_files: true

benchmark_sglang:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
name: "SGLang Serving Benchmark With SGLang"
needs: benchmark_shortfin
strategy:
matrix:
version: [3.11]
fail-fast: false
runs-on: mi300x-4
defaults:
run:
shell: bash
env:
PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
steps:
- name: Get Current Date
id: date
run: echo "::set-output name=date::$(date +'%Y-%m-%d')"

- name: "Setting up Python"
id: setup_python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{matrix.version}}

- name: "Checkout Code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Cache Pip Packages
uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}

- name: Install pip deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
pip install --no-compile -r requirements.txt -e sharktank/ shortfin/

# Try with the latest nightly releases, not what iree-turbine pins.
# We could also pin to a known working or stable version.
# This should eventually stabilize. Do the best we can for now.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime \
"numpy<2.0"

- name: Install SGLang
run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# Instruction for SGLang image sourced from here:
# https://sgl-project.github.io/start/install.html#method-3-using-docker
# We have to run in a docker container due to their vLLM dependency.
# From their pyproject.toml:
# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
# srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
- name: Pull SGLang Image (Had issues with sglang:v0.3.5.post2-rocm620)
run: |
docker pull lmsysorg/sglang:v0.3.5.post1-rocm620

- name: Run SGLang Server
run: |
docker run --rm -d \
--name=sglang-server \
--device=/dev/kfd \
--device=/dev/dri \
--ipc=host \
--shm-size 16G \
--group-add video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
-v $HOME/dockerx:/dockerx \
-v /data:/data \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env HF_TOKEN=${{ secrets.HF_TOKEN }} \
lmsysorg/sglang:v0.3.5.post1-rocm620 \
python3 -m sglang.launch_server \
--model-path meta-llama/Llama-3.1-8B-Instruct \
--host 0.0.0.0 \
--port 30000 \
--tp 1 \
--dtype float16 \
--disable-cuda-graph

- name: Run SGLang Benchmark Tests
run: |
pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=out/llm/sglang/index.html

- name: Stop sglang-server
run: docker stop sglang-server || true # Stop container if it's running

- name: Cleanup SGLang Image
run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
with:
github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
publish_dir: ./out/llm/sglang
destination_dir: ./llm/sglang
destination_dir: ./llm/sgl_benchmark/sglang
keep_files: true
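The new `benchmark_sglang` job starts the server detached (`docker run -d`) and relies on the test polling the endpoint before sending traffic (`wait_for_server(base_url, timeout=600)` in the test diff below). A minimal sketch of such a readiness poll, assuming the server exposes a `/health` endpoint — the repo's actual helper lives in `integration_tests.llm.utils` and may differ:

```python
import time
import urllib.error
import urllib.request


def wait_for_server(base_url: str, timeout: float = 600, poll_interval: float = 2.0) -> bool:
    """Poll `base_url` until it answers HTTP requests or `timeout` seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            # Assumes a /health endpoint; verify the route for your backend.
            with urllib.request.urlopen(f"{base_url}/health", timeout=5):
                return True
        except (urllib.error.URLError, OSError):
            # Server not up yet (connection refused / timeout); retry.
            time.sleep(poll_interval)
    return False
```

A generous timeout like the 600 seconds used here tolerates the image pull and model-artifact download on a cold runner.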
4 changes: 2 additions & 2 deletions .github/workflows/ci-sglang-integration-tests.yml
@@ -54,7 +54,7 @@ jobs:
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}

- name: Install pip deps
run: |
@@ -69,7 +69,7 @@

# Use newest possible releases to be able to track commits that may
# cause errors.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime \
"numpy<2.0"
14 changes: 14 additions & 0 deletions app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -54,3 +54,17 @@ def pre_process_model(request, tmp_path_factory):
compile_model(mlir_path, vmfb_path, settings)

return tmp_dir


def pytest_addoption(parser):
parser.addoption(
"--port",
action="store",
default="30000",
help="Port that SGLang server is running on",
)


@pytest.fixture(scope="module")
def sglang_args(request):
return request.config.getoption("--port")
app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -5,9 +5,6 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import logging
import multiprocessing
import os
from pathlib import Path
import pytest
import time
from unittest.mock import patch
@@ -17,71 +14,32 @@

from .utils import SGLangBenchmarkArgs, log_jsonl_result

from integration_tests.llm.utils import (
find_available_port,
start_llm_server,
)
from integration_tests.llm.utils import wait_for_server, download_with_hf_datasets

logger = logging.getLogger(__name__)

device_settings = {
"device_flags": [
"--iree-hal-target-backends=rocm",
"--iree-hip-target=gfx942",
],
"device": "hip",
}


@pytest.mark.parametrize(
"request_rate,model_param_file_name",
[
(req_rate, "meta-llama-3.1-8b-instruct.f16.gguf")
for req_rate in [1, 2, 4, 8, 16, 32]
],
"request_rate,model_name",
[(req_rate, "llama3_8B_fp16") for req_rate in [1, 2, 4, 8, 16, 32]],
)
@pytest.mark.parametrize(
"pre_process_model",
[
(
{
"model_name": "llama3_8B_fp16",
"model_param_file_name": "meta-llama-3.1-8b-instruct.f16.gguf",
"settings": device_settings,
"batch_sizes": [1, 4],
}
)
],
indirect=True,
)
def test_sglang_benchmark_server(
request_rate, model_param_file_name, pre_process_model
):
# TODO: Remove when multi-device is fixed
os.environ["ROCR_VISIBLE_DEVICES"] = "1"
def test_sglang_benchmark(request_rate, model_name, sglang_args, tmp_path_factory):
tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")

tmp_dir = pre_process_model
# Download tokenizer for llama3_8B_fp16
download_with_hf_datasets(tmp_dir, model_name)

config_path = tmp_dir / "config.json"
vmfb_path = tmp_dir / "model.vmfb"
tokenizer_path = tmp_dir / "tokenizer.json"
model_path = tmp_dir / model_param_file_name
logger.info("Beginning SGLang benchmark test...")

# Start shortfin llm server
port = find_available_port()
server_process = start_llm_server(
port,
tokenizer_path,
config_path,
vmfb_path,
model_path,
device_settings,
timeout=30,
)
port = sglang_args
base_url = f"http://localhost:{port}"

# Setting a high timeout gives enough time for downloading model artifacts
# and starting up server... Takes a little longer than shortfin.
wait_for_server(base_url, timeout=600)

# Run and collect SGLang Serving Benchmark
benchmark_args = SGLangBenchmarkArgs(
backend="shortfin",
backend="sglang",
num_prompt=10,
base_url=f"http://localhost:{port}",
tokenizer=tmp_dir,
@@ -95,21 +53,15 @@ def test_sglang_benchmark_server(

logger.info("Running SGLang Benchmark with the following args:")
logger.info(benchmark_args)

try:
start = time.time()
with patch.object(bench_serving, "print", side_effect=logger.info):
benchmark_process = multiprocessing.Process(
target=bench_serving.run_benchmark,
args=(benchmark_args.as_namespace(),),
bench_serving.run_benchmark(
benchmark_args.as_namespace(),
)
benchmark_process.start()
benchmark_process.join()

logger.info(f"Benchmark run completed in {str(time.time() - start)} seconds")
logger.info("======== RESULTS ========")
log_jsonl_result(benchmark_args.output_file)
except Exception as e:
logger.error(e)

server_process.terminate()
server_process.wait()
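The rewritten test keeps the `patch.object(bench_serving, "print", side_effect=logger.info)` trick, which reroutes the benchmark library's bare `print` calls into the pytest log. The pattern can be sketched in isolation — the `fake_module` stand-in below is hypothetical, not part of the repo:

```python
import logging
from types import SimpleNamespace
from unittest.mock import patch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("benchmark")

# Hypothetical stand-in for a third-party module (like bench_serving)
# that reports progress via bare print() calls.
fake_module = SimpleNamespace(print=print)

def run_noisy_task(mod):
    # Simulates a library function that prints its results directly.
    mod.print("request_rate=4 results written")

# While the patch is active, every fake_module.print(...) call is
# forwarded to logger.info, so output lands in the captured test log
# instead of raw stdout.
with patch.object(fake_module, "print", side_effect=logger.info):
    run_noisy_task(fake_module)
```

This works because `patch.object` swaps the module attribute for a mock whose `side_effect` is invoked with the same arguments, then restores the original `print` on exit.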