diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
new file mode 100644
index 000000000..1c382617d
--- /dev/null
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -0,0 +1,85 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: SGLang Llama Integration Tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Run periodically, every 4 hours. This is run periodically with the
+    # intent of catching regressions early, and allowing for those
+    # regressions to be easily triaged to a small subset of commits.
+    - cron: '0 */4 * * *'
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  sglang_bench_serve:
+    name: "SGLang Integration Tests"
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the version a string (3.10 would parse as 3.1).
+        version: ["3.11"]
+      fail-fast: false
+    runs-on: llama-mi300x-3
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Use newest possible releases to be able to track commits that may
+          # cause errors.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-base-compiler \
+            iree-base-runtime \
+            "numpy<2.0"
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Install sentence_transformers
+        run: pip install sentence_transformers
+
+      - name: Run Integration Tests
+        run: pytest -v app_tests/integration_tests/llm/sglang --log-cli-level=INFO
diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml
index 28e2bc883..bf8007e65 100644
--- a/.github/workflows/ci-shark-ai.yml
+++ b/.github/workflows/ci-shark-ai.yml
@@ -72,4 +72,4 @@ jobs:
             iree-base-runtime
 
       - name: Run LLM Integration Tests
-        run: pytest -v app_tests/integration_tests/llm --log-cli-level=INFO
+        run: pytest -v app_tests/integration_tests/llm/shortfin --log-cli-level=INFO
diff --git a/app_tests/__init__.py b/app_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/__init__.py
+++ b/app_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/benchmark_tests/__init__.py b/app_tests/benchmark_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/benchmark_tests/__init__.py
+++ b/app_tests/benchmark_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/__init__.py b/app_tests/integration_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/integration_tests/__init__.py
+++ b/app_tests/integration_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/__init__.py b/app_tests/integration_tests/llm/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/integration_tests/llm/__init__.py
+++ b/app_tests/integration_tests/llm/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/sglang/__init__.py b/app_tests/integration_tests/llm/sglang/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/sglang/conftest.py b/app_tests/integration_tests/llm/sglang/conftest.py
new file mode 100644
index 000000000..8543708da
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/conftest.py
@@ -0,0 +1,146 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import json
+import logging
+import os
+import pytest
+
+from ..utils import (
+    find_available_port,
+    start_llm_server,
+    download_with_hf_datasets,
+    export_paged_llm_v1,
+    compile_model,
+)
+
+pytest.importorskip("sglang")
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+
+pytest.importorskip("sentence_transformers")
+from sentence_transformers import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope="module")
+def register_shortfin_backend(available_port):
+    """Make a Shortfin backend on `available_port` the SGLang default."""
+    backend = sgl.Shortfin(
+        chat_template=get_chat_template("llama-3-instruct"),
+        base_url=f"http://localhost:{available_port}",
+    )
+    sgl.set_default_backend(backend)
+
+
+@pytest.fixture(scope="module")
+def pre_process_model(request, tmp_path_factory):
+    """Download, export and compile the model; return the artifact directory.
+
+    Expects an indirect parameter of the form ``{"device_settings": ...}``.
+    """
+    device_settings = request.param["device_settings"]
+    tmp_dir = tmp_path_factory.mktemp("sglang_integration_tests")
+
+    # Download model
+    model_params_path = tmp_dir / "meta-llama-3.1-8b-instruct.f16.gguf"
+    download_with_hf_datasets(tmp_dir, "llama3_8B_fp16")
+
+    # Export to mlir
+    mlir_path = tmp_dir / "model.mlir"
+    config_path = tmp_dir / "config.json"
+    batch_sizes = [1, 4]
+    export_paged_llm_v1(
+        mlir_path,
+        config_path,
+        model_params_path,
+        batch_sizes,
+    )
+
+    # Compile Model
+    vmfb_path = tmp_dir / "model.vmfb"
+    compile_model(
+        mlir_path,
+        vmfb_path,
+        device_settings,
+    )
+
+    # Write the server config to the same `config_path` handed to the export
+    # step above. The batch sizes reuse `batch_sizes` so the server config
+    # cannot drift from what was exported.
+    config = {
+        "module_name": "module",
+        "module_abi_version": 1,
+        "max_seq_len": 131072,
+        "attn_head_count": 8,
+        "attn_head_dim": 128,
+        "prefill_batch_sizes": batch_sizes,
+        "decode_batch_sizes": batch_sizes,
+        "transformer_block_count": 32,
+        "paged_kv_cache": {"block_seq_stride": 16, "device_block_count": 256},
+    }
+    with open(config_path, "w") as f:
+        json.dump(config, f)
+
+    return tmp_dir
+
+
+@pytest.fixture(scope="module")
+def available_port():
+    """Return the port chosen by `find_available_port`, once per module."""
+    return find_available_port()
+
+
+@pytest.fixture(scope="module")
+def start_server(request, pre_process_model, available_port):
+    """Start the LLM server on the pre-processed model and stop it on teardown.
+
+    Expects an indirect parameter of the form ``{"device_settings": ...}``.
+    """
+    # Remember the caller's value so the override below does not leak into
+    # tests that run after this module.
+    previous_devices = os.environ.get("ROCR_VISIBLE_DEVICES")
+    os.environ["ROCR_VISIBLE_DEVICES"] = "1"
+    device_settings = request.param["device_settings"]
+
+    export_dir = pre_process_model
+
+    tokenizer_path = export_dir / "tokenizer.json"
+    model_params_path = export_dir / "meta-llama-3.1-8b-instruct.f16.gguf"
+    vmfb_path = export_dir / "model.vmfb"
+    config_path = export_dir / "config.json"
+
+    server_process = None
+    try:
+        logger.info("Starting server...")
+        server_process = start_llm_server(
+            available_port,
+            tokenizer_path,
+            config_path,
+            vmfb_path,
+            model_params_path,
+            device_settings,
+            timeout=30,
+        )
+        logger.info("Server started")
+
+        yield server_process
+    finally:
+        if server_process is not None:
+            server_process.terminate()
+            server_process.wait()
+        if previous_devices is None:
+            os.environ.pop("ROCR_VISIBLE_DEVICES", None)
+        else:
+            os.environ["ROCR_VISIBLE_DEVICES"] = previous_devices
+
+
+@pytest.fixture(scope="module")
+def load_comparison_model():
+    """Load the `all-MiniLM-L6-v2` sentence-transformer used for scoring."""
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    return model
diff --git a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
new file mode 100644
index 000000000..efab14ea7
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
@@ -0,0 +1,309 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import logging
+import re
+import pytest
+
+from ..utils import (
+    AccuracyValidationException,
+)
+
+pytest.importorskip("sglang")
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+
+pytest.importorskip("sentence_transformers")
+from sentence_transformers import SentenceTransformer, util
+
+logger = logging.getLogger(__name__)
+
+DEVICE_SETTINGS = {
+    "device_flags": [
+        "--iree-hal-target-backends=rocm",
+        "--iree-hip-target=gfx942",
+    ],
+    "device": "hip",
+}
+
+ACCEPTED_THRESHOLD = 0.8  # a response passes only if its cosine similarity exceeds this
+
+
+def compute_similarity(model: SentenceTransformer, sentence_1: str, sentence_2: str):
+    embeddings = model.encode([sentence_1, sentence_2])
+    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=50, temperature=1.0))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=50, temperature=1.0))
+
+
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2)
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen("detailed_tip", max_tokens=50, temperature=1.0)
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_multi_turn_qa(load_comparison_model, start_server, register_shortfin_backend):
+    model = load_comparison_model
+
+    question_1 = "Name the capital city of the USA."
+    question_2 = "The Smithsonian is in this location."
+
+    answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    logger.info("Testing multi-turn Q&A run...")
+    state = multi_turn_question.run(
+        question_1=question_1,
+        question_2=question_2,
+    )
+    messages = state.messages()
+    logger.info("Received messages from multi-turn call.")
+
+    assert messages[0] == {
+        "role": "user",
+        "content": question_1,
+    }
+    assert messages[1]["role"] == "assistant"
+
+    logger.info("Computing similarity between first question and first answer...")
+    first_q_answer = messages[1]["content"]
+    score = compute_similarity(model, answer_1, first_q_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {answer_1} and {first_q_answer}:\n SCORE: {score}"
+        )
+    logger.info("Similarity passed")
+
+    assert messages[2] == {
+        "role": "user",
+        "content": question_2,
+    }
+    assert messages[3]["role"] == "assistant"
+
+    logger.info("Testing similarity between second question and second answer...")
+    second_q_answer = messages[3]["content"]
+    score = compute_similarity(model, answer_2, second_q_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {answer_2} and {second_q_answer}:\n SCORE: {score}"
+        )
+    logger.info("Similarity passed.")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_stream_multi_turn_qa(
+    load_comparison_model, start_server, register_shortfin_backend
+):
+    def clean_message(message: str):
+        """Remove chat tags from message before comparison.
+
+        Args:
+            message (str): Message to clean.
+
+        Returns:
+            str: Message without tags (i.e. <|start_header_id|>)
+        """
+        pattern = r"<\|.*?\|>"
+        return re.sub(pattern, "", message)
+
+    model = load_comparison_model
+    question_1 = "Name the capital city of the USA."
+    question_2 = "The Smithsonian is in this location."
+    expected_answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    expected_answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    logger.info("Testing multi-turn Q&A run w/ stream...")
+    state = multi_turn_question.run(
+        question_1=question_1,
+        question_2=question_2,
+        stream=True,
+    )
+    messages = ""
+    for chunk in state.text_iter():
+        messages += chunk
+    logger.info("Received messages from multi-turn call.")
+
+    logger.info("Computing similarity between expectation and result")
+    expected_result = f"user: {question_1}\nassistant: {expected_answer_1}\nuser: {question_2}\nassistant: {expected_answer_2}"
+    cleaned_messages = clean_message(messages)
+    score = compute_similarity(model, cleaned_messages, expected_result)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_result} and {messages}:\n SCORE: {score}"
+        )
+    logger.info("Similarity passed.")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_batch_multi_turn_qa(
+    load_comparison_model, start_server, register_shortfin_backend
+):
+    model = load_comparison_model
+
+    question_1_1 = "Name the capital city of the USA."
+    question_1_2 = "The Smithsonian is in this location."
+    expected_answer_1_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    expected_answer_1_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    question_2_1 = "Name the largest city in the USA."
+    question_2_2 = "The Empire State Building is in this location."
+    expected_answer_2_1 = "The largest city in the USA is New York City, with a population of over 8.4 million people, according to the United States Census Bureau (2020 estimates).assistant\n\nHowever, I should note that the largest city in the"
+    expected_answer_2_2 = "That's correct, the iconic Empire State Building is located in Midtown Manhattan, New York City. It's one of the most recognizable landmarks in the world and a symbol of the city's grandeur and history.assistant\n\nAnd, by"
+
+    logger.info("Testing batch multi-turn Q&A run...")
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": question_1_1,
+                "question_2": question_1_2,
+            },
+            {
+                "question_1": question_2_1,
+                "question_2": question_2_2,
+            },
+        ]
+    )
+
+    first_qa = states[0]
+    second_qa = states[1]
+
+    first_qa_messages = first_qa.messages()
+    second_qa_messages = second_qa.messages()
+
+    logger.info("Testing first batch of messages...")
+    assert first_qa_messages[0] == {
+        "role": "user",
+        "content": question_1_1,
+    }
+
+    assert first_qa_messages[1]["role"] == "assistant"
+    first_answer = first_qa_messages[1]["content"]
+    expected_answer = expected_answer_1_1
+    score = compute_similarity(model, expected_answer, first_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
+        )
+
+    assert first_qa_messages[2] == {
+        "role": "user",
+        "content": question_1_2,
+    }
+    assert first_qa_messages[3]["role"] == "assistant"
+    second_answer = first_qa_messages[3]["content"]
+    expected_answer = expected_answer_1_2
+    score = compute_similarity(model, expected_answer, second_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
+        )
+    logger.info("First batch passed.")
+
+    logger.info("Testing second batch of messages...")
+    assert second_qa_messages[0] == {
+        "role": "user",
+        "content": question_2_1,
+    }
+
+    assert second_qa_messages[1]["role"] == "assistant"
+    first_answer = second_qa_messages[1]["content"]
+    expected_answer = expected_answer_2_1
+    score = compute_similarity(model, expected_answer, first_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
+        )
+
+    assert second_qa_messages[2] == {
+        "role": "user",
+        "content": question_2_2,
+    }
+    assert second_qa_messages[3]["role"] == "assistant"
+    second_answer = second_qa_messages[3]["content"]
+    expected_answer = expected_answer_2_2
+    score = compute_similarity(model, expected_answer, second_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
+        )
+    logger.info("Second batch passed.")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_fork(load_comparison_model, start_server, register_shortfin_backend):
+    model = load_comparison_model
+
+    logger.info("Testing fork...")
+    state = tip_suggestion.run()
+    result = state.text()
+    logger.info("Fork response received.")
+
+    logger.info("Computing similarity...")
+    expected_answer = """Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.
+    Tip 1:A balanced diet is essential for maintaining good health. It involves consuming a variety of foods from different food groups, including fruits, vegetables, whole grains, lean proteins, and healthy fats. A balanced diet provides the body with the necessary nutrients, vitamins, and
+    Tip 2:Regular exercise is essential for maintaining a healthy body. It helps to improve cardiovascular health, increase strength and flexibility, and boost the immune system. Regular physical activity can also reduce the risk of chronic diseases such as heart disease, diabetes, and certain types of cancer
+    In summary, a balanced diet and regular exercise are two of the most important tips for staying healthy. By following these tips, you can maintain a healthy body and reduce the risk of chronic diseases.
+    """
+    score = compute_similarity(model, result, expected_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {result}:\n SCORE: {score}"
+        )
+    logger.info("Similarity passed.")
diff --git a/app_tests/integration_tests/llm/shortfin/__init__.py b/app_tests/integration_tests/llm/shortfin/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/app_tests/integration_tests/llm/shortfin/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/conftest.py b/app_tests/integration_tests/llm/shortfin/conftest.py
similarity index 99%
rename from app_tests/integration_tests/llm/conftest.py
rename to app_tests/integration_tests/llm/shortfin/conftest.py
index 5a5632405..0d40119c7 100644
--- a/app_tests/integration_tests/llm/conftest.py
+++ b/app_tests/integration_tests/llm/shortfin/conftest.py
@@ -12,7 +12,7 @@
 import shutil
 
 pytest.importorskip("transformers")
-from .utils import (
+from ..utils import (
     download_huggingface_model,
     download_tokenizer,
     export_paged_llm_v1,
diff --git a/app_tests/integration_tests/llm/cpu_llm_server_test.py b/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
similarity index 97%
rename from app_tests/integration_tests/llm/cpu_llm_server_test.py
rename to app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
index 7eef10103..c4da9e4eb 100644
--- a/app_tests/integration_tests/llm/cpu_llm_server_test.py
+++ b/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
@@ -10,7 +10,7 @@
 import requests
 import uuid
 
-from .utils import AccuracyValidationException, start_log_group, end_log_group
+from ..utils import AccuracyValidationException, start_log_group, end_log_group
 
logger = logging.getLogger(__name__) diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py index ba1ae7e5f..05712039e 100644 --- a/app_tests/integration_tests/llm/utils.py +++ b/app_tests/integration_tests/llm/utils.py @@ -7,6 +7,7 @@ import logging import multiprocessing import os +from pathlib import Path import subprocess import sys import time @@ -36,6 +37,31 @@ def download_huggingface_model(local_dir, repo_id, model_file): logger.info("Using cached model") +def download_with_hf_datasets(local_dir: Path | str, model_name: str): + """Download a model using `sharktank.utils.hf_datasets` script. + + Args: + local_dir (Path | str): Local directory to download model to. + model_name (str): Name of model to download. + """ + if isinstance(local_dir, Path): + local_dir = str(local_dir) + + logger.info(f"Download model {model_name} with `hf_datasets` to {local_dir}...") + subprocess.run( + [ + "python", + "-m", + "sharktank.utils.hf_datasets", + model_name, + "--local-dir", + local_dir, + ], + check=True, + ) + logger.info(f"Model {model_name} successfully downloaded.") + + def download_tokenizer(local_dir, tokenizer_id): # Set up tokenizer if it doesn't exist tokenizer_path = local_dir / "tokenizer.json"