nod-ai
diff --git a/‎.github/workflows/ci-llama-large-tests.yaml
+2-3 b/‎.github/workflows/ci-llama-large-tests.yaml
+2-3
diff --git a/‎.github/workflows/ci-llama-quick-tests.yaml
+2-3 b/‎.github/workflows/ci-llama-quick-tests.yaml
+2-3
diff --git a/‎.github/workflows/ci-sdxl.yaml
+1-1 b/‎.github/workflows/ci-sdxl.yaml
+1-1
diff --git a/‎.github/workflows/ci-sglang-benchmark.yml
+88 b/‎.github/workflows/ci-sglang-benchmark.yml
+88
diff --git a/‎.github/workflows/ci-shark-platform.yml
+3-4 b/‎.github/workflows/ci-shark-platform.yml
+3-4
diff --git a/‎.github/workflows/ci-sharktank.yml
+1-1 b/‎.github/workflows/ci-sharktank.yml
+1-1
diff --git a/‎.github/workflows/ci-tuner.yml
+4-1 b/‎.github/workflows/ci-tuner.yml
+4-1
diff --git a/‎.github/workflows/ci_eval.yaml
+2-3 b/‎.github/workflows/ci_eval.yaml
+2-3
diff --git a/‎.github/workflows/ci_linux_x64-libshortfin.yml
+1-1 b/‎.github/workflows/ci_linux_x64-libshortfin.yml
+1-1
diff --git a/‎.github/workflows/ci_linux_x64_asan-libshortfin.yml
+1-1 b/‎.github/workflows/ci_linux_x64_asan-libshortfin.yml
+1-1
diff --git a/‎.github/workflows/ci_linux_x64_nogil-libshortfin.yml
+1-1 b/‎.github/workflows/ci_linux_x64_nogil-libshortfin.yml
+1-1
diff --git a/‎.github/workflows/ci_windows_x64-libshortfin.yml
+1-1 b/‎.github/workflows/ci_windows_x64-libshortfin.yml
+1-1
diff --git a/‎README.md
+30-70 b/‎README.md
+30-70
diff --git a/‎app_tests/__init__.py b/‎app_tests/__init__.py
diff --git a/‎app_tests/benchmark_tests/__init__.py b/‎app_tests/benchmark_tests/__init__.py
diff --git a/‎app_tests/benchmark_tests/llm/conftest.py
+47 b/‎app_tests/benchmark_tests/llm/conftest.py
+47
@@ -70,9 +70,8 @@ jobs:
 
           # Test with pinned nightly releases, not what iree-turbine uses.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==2.9.0rc20241108 \
-            iree-base-runtime==2.9.0rc20241108 \
-            "numpy<2.0"
+            iree-base-compiler==3.0.0rc20241115 \
+            iree-base-runtime==3.0.0rc20241115
 
       - name: Run llama tests
         run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html
 
@@ -71,9 +71,8 @@ jobs:
 
           # Test with pinned nightly releases, not what iree-turbine uses.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==2.9.0rc20241108 \
-            iree-base-runtime==2.9.0rc20241108 \
-            "numpy<2.0"
+            iree-base-compiler==3.0.0rc20241115 \
+            iree-base-runtime==3.0.0rc20241115
 
       - name: Run llama 8b tests
         run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
 
@@ -64,7 +64,7 @@ jobs:
         repository: iree-org/iree
         path: ${{ env.IREE_REPO_DIR }}
         submodules: false
-        ref: iree-2.9.0rc20241108
+        ref: iree-3.0.0rc20241115
 
     - name: Initalize IREE submodules
       working-directory: ${{ env.IREE_REPO_DIR }}
 
@@ -0,0 +1,88 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: SGLang Llama Benchmarking Tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekdays at 04:00 UTC = 8:00 PM PST / 9:00 PM PDT on the previous evening.
+    - cron: "0 4 * * 1-5"
+
+concurrency:
+  # Keyed on the workflow name plus the PR number when the event carries one,
+  # otherwise the commit hash. With only workflow_dispatch and schedule
+  # triggers above, this is expected to resolve to the commit hash, so a newer
+  # run for the same commit cancels a queued or in-progress one.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  sglang_bench_serve:
+    name: "SGLang Serving Benchmark Tests"
+    strategy:
+      matrix:
+        version: ["3.11"]
+      fail-fast: false
+    runs-on: llama-mi300x-3
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Test with pinned nightly releases, not what iree-turbine uses.
+          # The compiler and runtime pins are kept at the same version, and
+          # numpy is held below 2.0 by the same install command.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-base-compiler==3.0.0rc20241115 \
+            iree-base-runtime==3.0.0rc20241115 \
+            "numpy<2.0"
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Run SGLang Benchmark Tests
+        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/sglang
+          destination_dir: ./llm/sglang
+          keep_files: true
@@ -67,10 +67,9 @@ jobs:
           # Try with the latest IREE nightly releases, not what iree-turbine pins.
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
-            iree-base-runtime \
-            "numpy<2.0"
+            iree-base-runtime
 
       - name: Run LLM Integration Tests
-        run: pytest -v build_tools/integration_tests/llm --log-cli-level=INFO
+        run: pytest -v app_tests/integration_tests/llm --log-cli-level=INFO
@@ -62,7 +62,7 @@ jobs:
           pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
 
           # Update to the latest iree packages.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler iree-base-runtime --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
 
 
@@ -49,8 +49,11 @@ jobs:
           pip install -r tuner/requirements-tuner.txt
           python -m pip install \
             --find-links https://iree.dev/pip-release-links.html \
-            --upgrade \
+            --upgrade --pre \
             iree-base-compiler iree-base-runtime
 
       - name: Run tuner tests
         run: pytest tuner/
+
+      - name: Run mypy type checker
+        run: mypy tuner/tuner
@@ -69,10 +69,9 @@ jobs:
           # Try with the latest IREE nightly releases, not what iree-turbine pins.
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
-            iree-base-runtime \
-            "numpy<2.0"
+            iree-base-runtime
 
       - name: Run perplexity test with vmfb
         run:  pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
 
@@ -59,7 +59,7 @@ jobs:
         repository: iree-org/iree
         path: ${{ env.IREE_REPO_DIR }}
         submodules: false
-        ref: iree-2.9.0rc20241108
+        ref: iree-3.0.0rc20241115
 
     - name: Initalize IREE submodules
       working-directory: ${{ env.IREE_REPO_DIR }}
 
@@ -109,7 +109,7 @@ jobs:
         repository: iree-org/iree
         path: ${{ env.IREE_SOURCE_DIR }}
         submodules: false
-        ref: iree-2.9.0rc20241108
+        ref: iree-3.0.0rc20241115
 
     - name: Initalize IREE submodules
       working-directory: ${{ env.IREE_SOURCE_DIR }}
 
@@ -57,7 +57,7 @@ jobs:
         repository: iree-org/iree
         path: ${{ env.IREE_REPO_DIR }}
         submodules: false
-        ref: iree-2.9.0rc20241108
+        ref: iree-3.0.0rc20241115
 
     - name: Initalize IREE submodules
       working-directory: ${{ env.IREE_REPO_DIR }}
 
@@ -54,7 +54,7 @@ jobs:
         repository: iree-org/iree
         path: ${{ env.IREE_REPO_DIR }}
         submodules: false
-        ref: iree-2.9.0rc20241108
+        ref: iree-3.0.0rc20241115
 
     - name: Initalize IREE submodules
       working-directory: ${{ env.IREE_REPO_DIR }}
 
@@ -1,22 +1,40 @@
 # SHARK Modeling and Serving Libraries
 
-**WARNING: This is an early preview that is in progress. It is not ready for
-general use.**
+> [!IMPORTANT]
+> Development is still in progress for several project components. See the
+> notes below for which workflows are best supported.
 
 ![GitHub License](https://img.shields.io/github/license/nod-ai/SHARK-Platform)
- [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
+[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 
 <!-- TODO: high level overview, features when components are used together -->
 
 ## Sub-projects
 
+### [`shortfin/`](./shortfin/)
+
+<!-- TODO: features list here? -->
+
+[![PyPI version](https://badge.fury.io/py/shortfin.svg)](https://badge.fury.io/py/shortfin) [![CI - shortfin](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_linux_x64-libshortfin.yml/badge.svg?event=push)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_linux_x64-libshortfin.yml?query=event%3Apush)
+
+The shortfin sub-project is SHARK's high performance inference library and
+serving engine.
+
+* API documentation for shortfin is available on
+  [readthedocs](https://shortfin.readthedocs.io/en/latest/).
+
 ### [`sharktank/`](./sharktank/)
 
 [![PyPI version](https://badge.fury.io/py/sharktank.svg)](https://badge.fury.io/py/sharktank) [![CI - sharktank](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci-sharktank.yml/badge.svg?event=push)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci-sharktank.yml?query=event%3Apush)
 
 The SHARK Tank sub-project contains a collection of model recipes and
 conversion tools to produce inference-optimized programs.
 
+> [!WARNING]
+> SHARK Tank is still under development. Experienced users may want to try it
+> out, but we currently recommend most users download pre-exported or
+> pre-compiled model files for serving with shortfin.
+
 <!-- TODO: features list here? -->
 
 * See the [SHARK Tank Programming Guide](./docs/programming_guide.md) for
@@ -25,25 +43,18 @@ conversion tools to produce inference-optimized programs.
 * See [Direct Quantization with SHARK Tank](./docs/quantization.md)
   for information about quantization support.
 
-### [`shortfin/`](./shortfin/)
-
-<!-- TODO: features list here? -->
-
-[![PyPI version](https://badge.fury.io/py/shortfin.svg)](https://badge.fury.io/py/shortfin) [![CI - shortfin](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_linux_x64-libshortfin.yml/badge.svg?event=push)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_linux_x64-libshortfin.yml?query=event%3Apush)
-
-The shortfin sub-project is SHARK's high performance inference library and
-serving engine.
-
-* API documentation for shortfin is available on
-  [readthedocs](https://shortfin.readthedocs.io/en/latest/).
-
 ### [`tuner/`](./tuner/)
 
 [![CI - Tuner](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci-tuner.yml/badge.svg?event=push)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci-tuner.yml?query=event%3Apush)
 
 The Tuner sub-project assists with tuning program performance by searching for
 optimal parameter configurations to use during model compilation.
 
+> [!WARNING]
+> SHARK Tuner is still in early development. Interested users may want
+> to try it out, but the tuner is not ready for general use yet. Check out
+> [the readme](tuner/README.md) for more details.
+
 ## Support matrix
 
 <!-- TODO: version requirements for Python, ROCm, Linux, etc.  -->
@@ -55,62 +66,11 @@ Model name | Model recipes | Serving apps
 SDXL       | [`sharktank/sharktank/models/punet/`](https://github.com/nod-ai/SHARK-Platform/tree/main/sharktank/sharktank/models/punet) | [`shortfin/python/shortfin_apps/sd/`](https://github.com/nod-ai/SHARK-Platform/tree/main/shortfin/python/shortfin_apps/sd)
 llama      | [`sharktank/sharktank/models/llama/`](https://github.com/nod-ai/SHARK-Platform/tree/main/sharktank/sharktank/models/llama) | [`shortfin/python/shortfin_apps/llm/`](https://github.com/nod-ai/SHARK-Platform/tree/main/shortfin/python/shortfin_apps/llm)
 
-## Development getting started
-
-<!-- TODO: Remove or update this section. Common setup for all projects? -->
-
-Use this as a guide to get started developing the project using pinned,
-pre-release dependencies. You are welcome to deviate as you see fit, but
-these canonical directions mirror what the CI does.
-
-### Setup a venv
-
-We recommend setting up a virtual environment (venv). The project is configured
-to ignore `.venv` directories, and editors like VSCode pick them up by default.
-
-```
-python -m venv .venv
-source .venv/bin/activate
-```
-
-### Install PyTorch for your system
-
-If no explicit action is taken, the default PyTorch version will be installed.
-This will give you a current CUDA-based version. Install a different variant
-by doing so explicitly first:
-
-*CPU:*
-
-```
-pip install -r pytorch-cpu-requirements.txt
-```
-
-*ROCM:*
-
-```
-pip install -r pytorch-rocm-requirements.txt
-```
-
-### Install development packages
-
-```
-# Install editable local projects.
-pip install -r requirements.txt -e sharktank/ shortfin/
-
-# Optionally clone and install editable iree-turbine dep in deps/
-pip install -f https://iree.dev/pip-release-links.html --src deps \
-  -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-```
 
-### Running tests
+## SHARK Users
 
-```
-pytest sharktank
-pytest shortfin
-```
+If you're looking to use SHARK, check out our [User Guide](docs/user_guide.md).
 
-### Optional: pre-commits and developer settings
+## SHARK Developers
 
-This project is set up to use the `pre-commit` tooling. To install it in
-your local repo, run: `pre-commit install`. After this point, when making
-commits locally, hooks will run. See https://pre-commit.com/
+If you're looking to develop SHARK, check out our [Developer Guide](docs/developer_guide.md).
@@ -0,0 +1,47 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import json
+import os
+import pytest
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+from integration_tests.llm.utils import compile_model, export_paged_llm_v1
+
+
+@pytest.fixture(scope="module")
+def pre_process_model(request, tmp_path_factory):
+    tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
+
+    model_path = request.param["model_path"]
+    settings = request.param["settings"]
+    batch_sizes = request.param["batch_sizes"]
+
+    tmp_dir = tmp_path_factory.mktemp("llm_benchmark_test")
+    mlir_path = tmp_dir / "model.mlir"
+    config_path = tmp_dir / "config.json"
+    vmfb_path = tmp_dir / "model.vmfb"
+
+    export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes)
+
+    config = {
+        "module_name": "module",
+        "module_abi_version": 1,
+        "max_seq_len": 131072,
+        "attn_head_count": 8,
+        "attn_head_dim": 128,
+        "prefill_batch_sizes": batch_sizes,
+        "decode_batch_sizes": batch_sizes,
+        "transformer_block_count": 32,
+        "paged_kv_cache": {"block_seq_stride": 16, "device_block_count": 256},
+    }
+    with open(config_path, "w") as file:
+        json.dump(config, file)
+
+    compile_model(mlir_path, vmfb_path, settings)
+
+    return tmp_dir