From 808511ec8cf6232b7363c0ffdc6be53e4807088c Mon Sep 17 00:00:00 2001
From: Stephen Baione
Date: Tue, 19 Nov 2024 16:55:29 +0000
Subject: [PATCH 1/4] Implement sglang integration tests, Restructure
 app_tests/integration_tests, Add copyright headers to files in
 integration_tests that were missing it

---
 .../workflows/ci-sglang-integration-tests.yml |  87 +++++
 .github/workflows/ci-shark-ai.yml             |   2 +-
 app_tests/__init__.py                         |   5 +
 app_tests/benchmark_tests/__init__.py         |   5 +
 app_tests/integration_tests/__init__.py       |   5 +
 app_tests/integration_tests/llm/__init__.py   |   5 +
 .../integration_tests/llm/sglang/__init__.py  |   5 +
 .../integration_tests/llm/sglang/conftest.py  | 123 ++++++++
 .../llm/sglang/sglang_frontend_test.py        | 297 ++++++++++++++++++
 .../llm/shortfin/__init__.py                  |   5 +
 .../llm/{ => shortfin}/conftest.py            |   2 +-
 .../llm/{ => shortfin}/cpu_llm_server_test.py |   2 +-
 app_tests/integration_tests/llm/utils.py      |  26 ++
 13 files changed, 566 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/ci-sglang-integration-tests.yml
 create mode 100644 app_tests/integration_tests/llm/sglang/__init__.py
 create mode 100644 app_tests/integration_tests/llm/sglang/conftest.py
 create mode 100644 app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
 create mode 100644 app_tests/integration_tests/llm/shortfin/__init__.py
 rename app_tests/integration_tests/llm/{ => shortfin}/conftest.py (99%)
 rename app_tests/integration_tests/llm/{ => shortfin}/cpu_llm_server_test.py (97%)

diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
new file mode 100644
index 000000000..309dc8567
--- /dev/null
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -0,0 +1,87 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: SGLang Llama Integration Tests
+
+on:
+  workflow_dispatch:
+  # TODO: Remove after validating action
+  pull_request:
+  schedule:
+    # Run periodically, every 4 hours. This is run periodically with the
+    # intent of catching regressions early, and allowing those
+    # regressions to be easily triaged to a small subset of commits.
+    - cron: '0 */4 * * *'
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  sglang_bench_serve:
+    name: "SGLang Integration Tests"
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: llama-mi300x-3
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non-default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-base-compiler==3.0.0rc20241115 \
+            iree-base-runtime==3.0.0rc20241115 \
+            "numpy<2.0"
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Install sentence_transformers
+        run: pip install sentence_transformers
+
+      - name: Run Integration Tests
+        run: pytest -v app_tests/integration_tests/sglang --log-cli-level=INFO
diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml
index 28e2bc883..bf8007e65 100644
--- a/.github/workflows/ci-shark-ai.yml
+++ b/.github/workflows/ci-shark-ai.yml
@@ -72,4 +72,4 @@ jobs:
           iree-base-runtime

       - name: Run LLM Integration Tests
-        run: pytest -v app_tests/integration_tests/llm --log-cli-level=INFO
+        run: pytest -v app_tests/integration_tests/llm/shortfin --log-cli-level=INFO
diff --git a/app_tests/__init__.py b/app_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/__init__.py
+++ b/app_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/benchmark_tests/__init__.py b/app_tests/benchmark_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/benchmark_tests/__init__.py
+++ b/app_tests/benchmark_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/__init__.py b/app_tests/integration_tests/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/integration_tests/__init__.py
+++ b/app_tests/integration_tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/__init__.py b/app_tests/integration_tests/llm/__init__.py
index e69de29bb..a85ba359d 100644
--- a/app_tests/integration_tests/llm/__init__.py
+++ b/app_tests/integration_tests/llm/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/sglang/__init__.py b/app_tests/integration_tests/llm/sglang/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/sglang/conftest.py b/app_tests/integration_tests/llm/sglang/conftest.py
new file mode 100644
index 000000000..8543708da
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/conftest.py
@@ -0,0 +1,123 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import json
+import logging
+import os
+import pytest
+
+from ..utils import (
+    find_available_port,
+    start_llm_server,
+    download_with_hf_datasets,
+    export_paged_llm_v1,
+    compile_model,
+)
+
+pytest.importorskip("sglang")
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+
+pytest.importorskip("sentence_transformers")
+from sentence_transformers import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope="module")
+def register_shortfin_backend(available_port):
+    backend = sgl.Shortfin(
+        chat_template=get_chat_template("llama-3-instruct"),
+        base_url=f"http://localhost:{available_port}",
+    )
+    sgl.set_default_backend(backend)
+
+
+@pytest.fixture(scope="module")
+def pre_process_model(request, tmp_path_factory):
+    device_settings = request.param["device_settings"]
+    tmp_dir = tmp_path_factory.mktemp("sglang_integration_tests")
+
+    # Download model
+    model_params_path = tmp_dir / "meta-llama-3.1-8b-instruct.f16.gguf"
+    download_with_hf_datasets(tmp_dir, "llama3_8B_fp16")
+
+    # Export to mlir
+    mlir_path = tmp_dir / "model.mlir"
+    config_path = tmp_dir / "config.json"
+    batch_sizes = [1, 4]
+    export_paged_llm_v1(
+        mlir_path,
+        config_path,
+        model_params_path,
+        batch_sizes,
+    )
+
+    # Compile Model
+    vmfb_path = tmp_dir / "model.vmfb"
+    compile_model(
+        mlir_path,
+        vmfb_path,
+        device_settings,
+    )
+
+    config = {
+        "module_name": "module",
+        "module_abi_version": 1,
+        "max_seq_len": 131072,
+        "attn_head_count": 8,
+        "attn_head_dim": 128,
+        "prefill_batch_sizes": [1, 4],
+        "decode_batch_sizes": [1, 4],
+        "transformer_block_count": 32,
+        "paged_kv_cache": {"block_seq_stride": 16, "device_block_count": 256},
+    }
+    config_path = tmp_dir / "config.json"
+    with open(config_path, "w") as f:
+        json.dump(config, f)
+
+    return tmp_dir
+
+
+@pytest.fixture(scope="module")
+def available_port():
+    return find_available_port()
+
+
+@pytest.fixture(scope="module")
+def start_server(request, pre_process_model, available_port):
+    os.environ["ROCR_VISIBLE_DEVICES"] = "1"
+    device_settings = request.param["device_settings"]
+
+    export_dir = pre_process_model
+
+    tokenizer_path = export_dir / "tokenizer.json"
+    model_params_path = export_dir / "meta-llama-3.1-8b-instruct.f16.gguf"
+    vmfb_path = export_dir / "model.vmfb"
+    config_path = export_dir / "config.json"
+
+    logger.info("Starting server...")
+    server_process = start_llm_server(
+        available_port,
+        tokenizer_path,
+        config_path,
+        vmfb_path,
+        model_params_path,
+        device_settings,
+        timeout=30,
+    )
+    logger.info("Server started")
+
+    yield server_process
+
+    server_process.terminate()
+    server_process.wait()
+
+
+@pytest.fixture(scope="module")
+def load_comparison_model():
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    return model
diff --git a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
new file mode 100644
index 000000000..07e7828f0
--- /dev/null
+++ b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
@@ -0,0 +1,297 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import logging
+import re
+import pytest
+
+from ..utils import (
+    AccuracyValidationException,
+)
+
+pytest.importorskip("sglang")
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+
+pytest.importorskip("sentence_transformers")
+from sentence_transformers import SentenceTransformer, util
+
+logger = logging.getLogger(__name__)
+
+DEVICE_SETTINGS = {
+    "device_flags": [
+        "--iree-hal-target-backends=rocm",
+        "--iree-hip-target=gfx942",
+    ],
+    "device": "hip",
+}
+
+ACCEPTED_THRESHOLD = 0.8
+
+
+def compute_similarity(model: SentenceTransformer, sentence_1: str, sentence_2: str):
+    embeddings = model.encode([sentence_1, sentence_2])
+    return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=50, temperature=1.0))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=50, temperature=1.0))
+
+
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2)
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen("detailed_tip", max_tokens=50, temperature=1.0)
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_multi_turn_qa(load_comparison_model, start_server, register_shortfin_backend):
+    model = load_comparison_model
+
+    logger.debug("Starting Multi-Turn Question Test...")
+    state = multi_turn_question.run(
+        question_1="Name the capital city of the USA.",
+        question_2="The Smithsonian is in this location.",
+    )
+
+    logger.debug("Obtaining messages...")
+    messages = state.messages()
+    logger.debug("Messages Obtained...")
+
+    logger.debug("Checking first Q&A turn...")
+    assert messages[0] == {
+        "role": "user",
+        "content": "Name the capital city of the USA.",
+    }
+    assert messages[1]["role"] == "assistant"
+
+    ideal_answer = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    first_q_answer = messages[1]["content"]
+    logger.debug("Computing Similarity Score...")
+
+    score = compute_similarity(model, ideal_answer, first_q_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {ideal_answer} and {first_q_answer}:\n SCORE: {score}"
+        )
+
+    assert messages[2] == {
+        "role": "user",
+        "content": "The Smithsonian is in this location.",
+    }
+    assert messages[3]["role"] == "assistant"
+
+    expected_answer = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+    second_q_answer = messages[3]["content"]
+    score = compute_similarity(model, expected_answer, second_q_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {second_q_answer}:\n SCORE: {score}"
+        )
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_stream_multi_turn_qa(
+    load_comparison_model, start_server, register_shortfin_backend
+):
+    def clean_message(message: str):
+        """Remove chat tags from message before comparison.
+
+        Args:
+            message (str): Message to clean.
+
+        Returns:
+            str: Message without tags (e.g. <|start_header_id|>)
+        """
+        pattern = r"<\|.*?\|>"
+        return re.sub(pattern, "", message)
+
+    model = load_comparison_model
+    question_1 = "Name the capital city of the USA."
+    question_2 = "The Smithsonian is in this location."
+    expected_answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    expected_answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    logger.debug("Starting Multi-Turn Question Test...")
+    state = multi_turn_question.run(
+        question_1=question_1,
+        question_2=question_2,
+        stream=True,
+    )
+
+    logger.debug("Obtaining messages...")
+    messages = ""
+    for chunk in state.text_iter():
+        messages += chunk
+    logger.debug("Messages Obtained...")
+    expected_result = f"user: {question_1}\nassistant: {expected_answer_1}\nuser: {question_2}\nassistant: {expected_answer_2}"
+    cleaned_messages = clean_message(messages)
+    score = compute_similarity(model, cleaned_messages, expected_result)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_result} and {messages}:\n SCORE: {score}"
+        )
+
+    logger.debug(f"Stream Messages: {messages}")
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_batch_multi_turn_qa(
+    load_comparison_model, start_server, register_shortfin_backend
+):
+    model = load_comparison_model
+
+    question_1_1 = "Name the capital city of the USA."
+    question_1_2 = "The Smithsonian is in this location."
+    expected_answer_1_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    expected_answer_1_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    question_2_1 = "Name the largest city in the USA."
+    question_2_2 = "The Empire State Building is in this location."
+    expected_answer_2_1 = "The largest city in the USA is New York City, with a population of over 8.4 million people, according to the United States Census Bureau (2020 estimates).assistant\n\nHowever, I should note that the largest city in the"
+    expected_answer_2_2 = "That's correct, the iconic Empire State Building is located in Midtown Manhattan, New York City. It's one of the most recognizable landmarks in the world and a symbol of the city's grandeur and history.assistant\n\nAnd, by"
+
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": question_1_1,
+                "question_2": question_1_2,
+            },
+            {
+                "question_1": question_2_1,
+                "question_2": question_2_2,
+            },
+        ]
+    )
+
+    first_qa = states[0]
+    second_qa = states[1]
+
+    first_messages = first_qa.messages()
+    second_messages = second_qa.messages()
+    assert first_messages[0] == {
+        "role": "user",
+        "content": question_1_1,
+    }
+
+    assert first_messages[1]["role"] == "assistant"
+    first_answer = first_messages[1]["content"]
+    expected_answer = expected_answer_1_1
+    score = compute_similarity(model, expected_answer, first_answer.lower())
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
+        )
+
+    assert first_messages[2] == {
+        "role": "user",
+        "content": question_1_2,
+    }
+    assert first_messages[3]["role"] == "assistant"
+    second_answer = first_messages[3]["content"]
+    expected_answer = expected_answer_1_2
+    score = compute_similarity(model, expected_answer, second_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
+        )
+
+    second_messages = second_qa.messages()
+    assert second_messages[0] == {
+        "role": "user",
+        "content": question_2_1,
+    }
+
+    assert second_messages[1]["role"] == "assistant"
+    first_answer = second_messages[1]["content"]
+    expected_answer = expected_answer_2_1
+    score = compute_similarity(model, expected_answer, first_answer.lower())
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
+        )
+
+    assert second_messages[2] == {
+        "role": "user",
+        "content": question_2_2,
+    }
+    assert second_messages[3]["role"] == "assistant"
+    second_answer = second_messages[3]["content"]
+    expected_answer = expected_answer_2_2
+    score = compute_similarity(model, expected_answer, second_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
+        )
+
+
+@pytest.mark.parametrize(
+    "pre_process_model,start_server",
+    [
+        (
+            {"device_settings": DEVICE_SETTINGS},
+            {"device_settings": DEVICE_SETTINGS},
+        )
+    ],
+    indirect=True,
+)
+def test_fork(load_comparison_model, start_server, register_shortfin_backend):
+    model = load_comparison_model
+
+    state = tip_suggestion.run()
+    result = state.text()
+    expected_answer = """Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.
+    Tip 1:A balanced diet is essential for maintaining good health. It involves consuming a variety of foods from different food groups, including fruits, vegetables, whole grains, lean proteins, and healthy fats. A balanced diet provides the body with the necessary nutrients, vitamins, and
+    Tip 2:Regular exercise is essential for maintaining a healthy body. It helps to improve cardiovascular health, increase strength and flexibility, and boost the immune system. Regular physical activity can also reduce the risk of chronic diseases such as heart disease, diabetes, and certain types of cancer
+    In summary, a balanced diet and regular exercise are two of the most important tips for staying healthy. By following these tips, you can maintain a healthy body and reduce the risk of chronic diseases.
+    """
+    score = compute_similarity(model, result, expected_answer)
+    if not score > ACCEPTED_THRESHOLD:
+        raise AccuracyValidationException(
+            f"Accuracy error between {expected_answer} and {result}:\n SCORE: {score}"
+        )
diff --git a/app_tests/integration_tests/llm/shortfin/__init__.py b/app_tests/integration_tests/llm/shortfin/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/app_tests/integration_tests/llm/shortfin/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/integration_tests/llm/conftest.py b/app_tests/integration_tests/llm/shortfin/conftest.py
similarity index 99%
rename from app_tests/integration_tests/llm/conftest.py
rename to app_tests/integration_tests/llm/shortfin/conftest.py
index 5a5632405..0d40119c7 100644
--- a/app_tests/integration_tests/llm/conftest.py
+++ b/app_tests/integration_tests/llm/shortfin/conftest.py
@@ -12,7 +12,7 @@
 import shutil

 pytest.importorskip("transformers")
-from .utils import (
+from ..utils import (
     download_huggingface_model,
     download_tokenizer,
     export_paged_llm_v1,
diff --git a/app_tests/integration_tests/llm/cpu_llm_server_test.py b/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
similarity index 97%
rename from app_tests/integration_tests/llm/cpu_llm_server_test.py
rename to app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
index 7eef10103..c4da9e4eb 100644
--- a/app_tests/integration_tests/llm/cpu_llm_server_test.py
+++ b/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
@@ -10,7 +10,7 @@
 import requests
 import uuid

-from .utils import AccuracyValidationException, start_log_group, end_log_group
+from ..utils import AccuracyValidationException, start_log_group, end_log_group

 logger = logging.getLogger(__name__)

diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py
index ba1ae7e5f..05712039e 100644
--- a/app_tests/integration_tests/llm/utils.py
+++ b/app_tests/integration_tests/llm/utils.py
@@ -7,6 +7,7 @@
 import logging
 import multiprocessing
 import os
+from pathlib import Path
 import subprocess
 import sys
 import time
@@ -36,6 +37,31 @@ def download_huggingface_model(local_dir, repo_id, model_file):
         logger.info("Using cached model")


+def download_with_hf_datasets(local_dir: Path | str, model_name: str):
+    """Download a model using the `sharktank.utils.hf_datasets` script.
+
+    Args:
+        local_dir (Path | str): Local directory to download model to.
+        model_name (str): Name of model to download.
+    """
+    if isinstance(local_dir, Path):
+        local_dir = str(local_dir)
+
+    logger.info(f"Downloading model {model_name} with `hf_datasets` to {local_dir}...")
+    subprocess.run(
+        [
+            "python",
+            "-m",
+            "sharktank.utils.hf_datasets",
+            model_name,
+            "--local-dir",
+            local_dir,
+        ],
+        check=True,
+    )
+    logger.info(f"Model {model_name} successfully downloaded.")
+
+
 def download_tokenizer(local_dir, tokenizer_id):
     # Set up tokenizer if it doesn't exist
     tokenizer_path = local_dir / "tokenizer.json"

From 8cacfd16e811b56ba1c1168d98b3f050e05cbb36 Mon Sep 17 00:00:00 2001
From: Stephen Baione
Date: Tue, 19 Nov 2024 17:02:13 +0000
Subject: [PATCH 2/4] Don't pin `iree-base-compiler` and `iree-base-runtime`

---
 .github/workflows/ci-sglang-integration-tests.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index 309dc8567..ab3188a8a 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -69,12 +69,11 @@ jobs:
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
           pip install --no-compile -r requirements.txt -e sharktank/ shortfin/

-          # Try with the latest nightly releases, not what iree-turbine pins.
-          # We could also pin to a known working or stable version.
-          # This should eventually stabilize. Do the best we can for now.
+          # Use the newest possible releases to be able to track commits that
+          # may cause errors.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241115 \
-            iree-base-runtime==3.0.0rc20241115 \
+            iree-base-compiler \
+            iree-base-runtime \
             "numpy<2.0"

       - name: Install SGLang

From 5cac7180761d6e7811986a5cff0e65ff4168ad95 Mon Sep 17 00:00:00 2001
From: Stephen Baione
Date: Tue, 19 Nov 2024 17:05:19 +0000
Subject: [PATCH 3/4] Fix path to sglang integration tests

---
 .github/workflows/ci-sglang-integration-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index ab3188a8a..ee68ba322 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -83,4 +83,4 @@ jobs:
         run: pip install sentence_transformers

       - name: Run Integration Tests
-        run: pytest -v app_tests/integration_tests/sglang --log-cli-level=INFO
+        run: pytest -v app_tests/integration_tests/llm/sglang --log-cli-level=INFO

From 627af2d13a976a09862bcb0c9710dd93d30a28f1 Mon Sep 17 00:00:00 2001
From: Stephen Baione
Date: Tue, 19 Nov 2024 18:20:34 +0000
Subject: [PATCH 4/4] Remove PR trigger from workflow, Add more logging and a
 little cleanup in sglang_frontend_test

---
 .../workflows/ci-sglang-integration-tests.yml |  2 -
 .../llm/sglang/sglang_frontend_test.py        | 92 +++++++++++--------
 2 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index ee68ba322..1c382617d 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -8,8 +8,6 @@ name: SGLang Llama Integration Tests

 on:
   workflow_dispatch:
-  # TODO: Remove after validating action
-  pull_request:
   schedule:
     # Run periodically, every 4 hours. This is run periodically with the
     # intent of catching regressions early, and allowing those
diff --git a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
index 07e7828f0..efab14ea7 100644
--- a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
+++ b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
@@ -75,46 +75,49 @@ def tip_suggestion(s):
 def test_multi_turn_qa(load_comparison_model, start_server, register_shortfin_backend):
     model = load_comparison_model

-    logger.debug("Starting Multi-Turn Question Test...")
+    question_1 = "Name the capital city of the USA."
+    question_2 = "The Smithsonian is in this location."
+
+    answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+
+    logger.info("Testing multi-turn Q&A run...")
     state = multi_turn_question.run(
-        question_1="Name the capital city of the USA.",
-        question_2="The Smithsonian is in this location.",
+        question_1=question_1,
+        question_2=question_2,
     )
-
-    logger.debug("Obtaining messages...")
     messages = state.messages()
-    logger.debug("Messages Obtained...")
+    logger.info("Received messages from multi-turn call.")

-    logger.debug("Checking first Q&A turn...")
     assert messages[0] == {
         "role": "user",
-        "content": "Name the capital city of the USA.",
+        "content": question_1,
     }
     assert messages[1]["role"] == "assistant"

-    ideal_answer = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
+    logger.info("Computing similarity between first question and first answer...")
     first_q_answer = messages[1]["content"]
-    logger.debug("Computing Similarity Score...")
-
-    score = compute_similarity(model, ideal_answer, first_q_answer)
+    score = compute_similarity(model, answer_1, first_q_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
-            f"Accuracy error between {ideal_answer} and {first_q_answer}:\n SCORE: {score}"
+            f"Accuracy error between {answer_1} and {first_q_answer}:\n SCORE: {score}"
         )
+    logger.info("Similarity passed.")

     assert messages[2] == {
         "role": "user",
-        "content": "The Smithsonian is in this location.",
+        "content": question_2,
     }
     assert messages[3]["role"] == "assistant"

-    expected_answer = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"
+    logger.info("Testing similarity between second question and second answer...")
     second_q_answer = messages[3]["content"]
-    score = compute_similarity(model, expected_answer, second_q_answer)
+    score = compute_similarity(model, answer_2, second_q_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
-            f"Accuracy error between {expected_answer} and {second_q_answer}:\n SCORE: {score}"
+            f"Accuracy error between {answer_2} and {second_q_answer}:\n SCORE: {score}"
         )
+    logger.info("Similarity passed.")
@@ -148,18 +151,18 @@ def clean_message(message: str):
     expected_answer_1 = "The capital city of the United States of America is Washington, D.C. (short for District of Columbia).assistant\n\nWould you like to know more about Washington, D.C. or is there something else I can help you with?"
     expected_answer_2 = "The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. It was founded in 1846 and is named after British scientist James Smithson, who left a bequest to"

-    logger.debug("Starting Multi-Turn Question Test...")
+    logger.info("Testing multi-turn Q&A run w/ stream...")
     state = multi_turn_question.run(
         question_1=question_1,
         question_2=question_2,
         stream=True,
     )
-
-    logger.debug("Obtaining messages...")
     messages = ""
     for chunk in state.text_iter():
         messages += chunk
-    logger.debug("Messages Obtained...")
+    logger.info("Received messages from multi-turn call.")
+
+    logger.info("Computing similarity between expectation and result")
     expected_result = f"user: {question_1}\nassistant: {expected_answer_1}\nuser: {question_2}\nassistant: {expected_answer_2}"
     cleaned_messages = clean_message(messages)
     score = compute_similarity(model, cleaned_messages, expected_result)
@@ -167,8 +170,7 @@ def clean_message(message: str):
         raise AccuracyValidationException(
             f"Accuracy error between {expected_result} and {messages}:\n SCORE: {score}"
         )
-
-    logger.debug(f"Stream Messages: {messages}")
+    logger.info("Similarity passed.")


 @pytest.mark.parametrize(
@@ -196,78 +198,83 @@ def test_batch_multi_turn_qa(
     expected_answer_2_1 = "The largest city in the USA is New York City, with a population of over 8.4 million people, according to the United States Census Bureau (2020 estimates).assistant\n\nHowever, I should note that the largest city in the"
     expected_answer_2_2 = "That's correct, the iconic Empire State Building is located in Midtown Manhattan, New York City. It's one of the most recognizable landmarks in the world and a symbol of the city's grandeur and history.assistant\n\nAnd, by"

+    logger.info("Testing batch multi-turn Q&A run...")
     states = multi_turn_question.run_batch(
         [
             {
                 "question_1": question_1_1,
                 "question_2": question_1_2,
             },
             {
                 "question_1": question_2_1,
                 "question_2": question_2_2,
             },
         ]
     )

     first_qa = states[0]
     second_qa = states[1]

-    first_messages = first_qa.messages()
-    second_messages = second_qa.messages()
-    assert first_messages[0] == {
+    first_qa_messages = first_qa.messages()
+    second_qa_messages = second_qa.messages()
+
+    logger.info("Testing first batch of messages...")
+    assert first_qa_messages[0] == {
         "role": "user",
         "content": question_1_1,
     }

-    assert first_messages[1]["role"] == "assistant"
-    first_answer = first_messages[1]["content"]
+    assert first_qa_messages[1]["role"] == "assistant"
+    first_answer = first_qa_messages[1]["content"]
     expected_answer = expected_answer_1_1
-    score = compute_similarity(model, expected_answer, first_answer.lower())
+    score = compute_similarity(model, expected_answer, first_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
         )

-    assert first_messages[2] == {
+    assert first_qa_messages[2] == {
         "role": "user",
         "content": question_1_2,
     }
-    assert first_messages[3]["role"] == "assistant"
-    second_answer = first_messages[3]["content"]
+    assert first_qa_messages[3]["role"] == "assistant"
+    second_answer = first_qa_messages[3]["content"]
     expected_answer = expected_answer_1_2
     score = compute_similarity(model, expected_answer, second_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
         )
+    logger.info("First batch passed.")

-    second_messages = second_qa.messages()
-    assert second_messages[0] == {
+    logger.info("Testing second batch of messages...")
+    assert second_qa_messages[0] == {
         "role": "user",
         "content": question_2_1,
     }

-    assert second_messages[1]["role"] == "assistant"
-    first_answer = second_messages[1]["content"]
+    assert second_qa_messages[1]["role"] == "assistant"
+    first_answer = second_qa_messages[1]["content"]
     expected_answer = expected_answer_2_1
-    score = compute_similarity(model, expected_answer, first_answer.lower())
+    score = compute_similarity(model, expected_answer, first_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {first_answer}:\n SCORE: {score}"
         )

-    assert second_messages[2] == {
+    assert second_qa_messages[2] == {
         "role": "user",
         "content": question_2_2,
     }
-    assert second_messages[3]["role"] == "assistant"
-    second_answer = second_messages[3]["content"]
+    assert second_qa_messages[3]["role"] == "assistant"
+    second_answer = second_qa_messages[3]["content"]
     expected_answer = expected_answer_2_2
     score = compute_similarity(model, expected_answer, second_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {second_answer}:\n SCORE: {score}"
         )
+    logger.info("Second batch passed.")
@@ -283,15 +290,20 @@ def test_batch_multi_turn_qa(
 def test_fork(load_comparison_model, start_server, register_shortfin_backend):
     model = load_comparison_model

+    logger.info("Testing fork...")
     state = tip_suggestion.run()
     result = state.text()
+    logger.info("Fork response received.")
+
+    logger.info("Computing similarity...")
     expected_answer = """Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.
     Tip 1:A balanced diet is essential for maintaining good health. It involves consuming a variety of foods from different food groups, including fruits, vegetables, whole grains, lean proteins, and healthy fats. A balanced diet provides the body with the necessary nutrients, vitamins, and
     Tip 2:Regular exercise is essential for maintaining a healthy body. It helps to improve cardiovascular health, increase strength and flexibility, and boost the immune system. Regular physical activity can also reduce the risk of chronic diseases such as heart disease, diabetes, and certain types of cancer
     In summary, a balanced diet and regular exercise are two of the most important tips for staying healthy. By following these tips, you can maintain a healthy body and reduce the risk of chronic diseases.
     """
     score = compute_similarity(model, result, expected_answer)
     if not score > ACCEPTED_THRESHOLD:
         raise AccuracyValidationException(
             f"Accuracy error between {expected_answer} and {result}:\n SCORE: {score}"
         )
+    logger.info("Similarity passed.")