
Add T5 LM v1.1 encoder #550

Merged (12 commits) on Nov 20, 2024
.github/workflows/ci-sharktank.yml (+59 −0)
@@ -70,3 +70,62 @@ jobs:
        if: ${{ !cancelled() }}
        run: |
          pytest -n 4 sharktank/


  test_with_data:
    name: "Data-dependent Tests"
    strategy:
      matrix:
        version: [3.11]
        runs-on: [llama-mi300x-3]
      fail-fast: false
    runs-on: ${{matrix.runs-on}}
    defaults:
      run:
        shell: bash
    env:
      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
      HF_HOME: "/data/huggingface"
      SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }}
    steps:
      - name: "Setting up Python"
        id: setup_python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{matrix.version}}

      - name: "Checkout Code"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Cache Pip Packages
        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
        id: cache-pip
        with:
          path: ${{ env.PIP_CACHE_DIR }}
          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','sharktank/requirements*.txt') }}

      - name: Install sharktank deps
        run: |
          python -m pip install --no-compile --upgrade pip
          # Note: We install in three steps in order to satisfy requirements
          # from non-default locations first. Installing the PyTorch CPU
          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
          pip install --no-compile -r pytorch-cpu-requirements.txt
          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

          # Install the latest iree-turbine.
          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

          # Try with the latest IREE nightly releases, not what iree-turbine pins.
          # We could also pin to a known working or stable version.
          # This should eventually stabilize. Do the best we can for now.
          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
            iree-base-compiler \
            iree-base-runtime

      - name: Run tests
        run: |
          pytest \
            --with-t5-data \
            sharktank/tests/models/t5/t5_test.py
sharktank/conftest.py (+42 −0)
@@ -88,6 +88,16 @@ def pytest_addoption(parser):
help="Enable all llama benchmarking tests",
)

parser.addoption(
"--with-t5-data",
action="store_true",
default=False,
help=(
"Enable tests that use T5 data like models that is not a part of the source "
"code. The user is expected to provide the data"
),
)

# TODO: Remove all hardcoded paths in CI tests
parser.addoption(
"--llama3-8b-tokenizer-path",
@@ -133,6 +143,28 @@ def pytest_addoption(parser):
help="Llama3.1 405b fp8 model path",
)

# To obtain a T5 GGUF file you can use llama.cpp's convert_hf_to_gguf.py.
# https://github.com/ggerganov/llama.cpp/blob/9abe9eeae98b11fa93b82632b264126a010225ff/convert_hf_to_gguf.py
# E.g.
# git lfs install
# git clone https://huggingface.co/google/t5-v1_1-small
# convert_hf_to_gguf.py \
# --outfile t5-v1_1-small.gguf \
# --outtype=f32 \
# t5-v1_1-small
parser.addoption(
"--google-t5-v1-1-small-fp32-model-path",
type=Path,
default="/data/t5/small/google__t5-v1_1-small_fp32.gguf",

[Review thread on this default path]
Collaborator: We don't hardcode any llama model/tokenizer paths here anymore. You can pass it as an arg directly to pytest.
Contributor (author): Why not have a default that allows you to run pytest sharktank/tests if you have the data at the default paths?
Contributor: For someone running these tests on a machine that does not have the data in the default paths, we should have at least a comment with a link or something for how/where to get this data.
Contributor (author): I added a comment.

help="Google T5 v1.1 small fp32 model path",
)
parser.addoption(
"--google-t5-v1-1-xxl-fp32-model-path",
type=Path,
default="/data/t5/xxl/google__t5-v1_1-xxl_fp32.gguf",
help="Google T5 v1.1 XXL fp32 model path",
)

parser.addoption(
"--baseline-perplexity-scores",
type=Path,
@@ -256,6 +288,16 @@ def get_model_artifacts(request: FixtureRequest):
model_path["llama3_405b_fp8_model_path"] = set_fixture_from_cli_option(
request, "--llama3-405b-fp8-model-path", "llama3_405b_fp8_model"
)
model_path["google__t5_v1_1_small_fp32_model_path"] = set_fixture_from_cli_option(

[Review thread on this fixture]
Contributor: Do we need two separate model_path flags for small vs. xxl models instead of just calling this twice from our tests?
Contributor (author, @sogartar, Nov 18, 2024): This follows the nomenclature already accepted for the Llama variants. I think we will get a lot more variants, like fp16 and other quantizations. We probably want sane defaults for all files so that you can run pytest sharktank/tests if you already have the files in their expected places. It is important to have a simple command to run all tests.

        request,
        "--google-t5-v1-1-small-fp32-model-path",
        "google__t5_v1_1_small_fp32_model",
    )
    model_path["google__t5_v1_1_xxl_fp32_model_path"] = set_fixture_from_cli_option(
        request,
        "--google-t5-v1-1-xxl-fp32-model-path",
        "google__t5_v1_1_xxl_fp32_model",
    )
    return model_path
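
For context, a hypothetical sketch of how a test could consume these fixture values. It assumes get_model_artifacts is registered as a pytest fixture elsewhere in conftest.py and that it returns the model_path dict built above; the actual consumption pattern in t5_test.py may differ.

    # Hypothetical test; the fixture name "get_model_artifacts" and the dict key
    # are assumptions based on the code above, not taken from t5_test.py.
    def test_t5_v1_1_small_gguf_is_available(get_model_artifacts):
        gguf_path = get_model_artifacts["google__t5_v1_1_small_fp32_model_path"]
        assert gguf_path.exists(), f"Expected T5 GGUF at {gguf_path}"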


sharktank/sharktank/layers/configs/llm_configs.py (+71 −2)
@@ -14,11 +14,11 @@
(and indeed, can bootstrap these off of GGUF files).
"""

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Any, Optional
import torch

__all__ = ["LlamaHParams", "LlamaModelConfig"]
__all__ = ["LlamaHParams", "LlamaModelConfig", "T5Config"]


@dataclass
@@ -179,3 +179,72 @@ class LlamaModelConfig:
    # be the difference of many gigabytes of static data being embedded in
    # the program and not.
    static_tables: bool = True


@dataclass
class T5Config:
    return_dict: bool = True
    output_hidden_states: bool = False
    output_attentions: bool = False
    is_encoder_decoder: bool = True
    is_decoder: bool = False
    vocab_size: int = 32128
    d_model: int = 512
    d_kv: int = 64
    d_ff: int = 2048
    num_layers: int = 6
    num_decoder_layers: int = 6
    num_heads: int = 8
    relative_attention_num_buckets: int = 32
    relative_attention_max_distance: int = 128
    layer_norm_epsilon: float = 1e-6
    feed_forward_proj: str = "relu"
    is_gated_act: bool = field(init=False)
    activation_dtype: torch.dtype = torch.float32
    dense_act_fn: str = field(init=False)
    use_cache: bool = True
    pad_token_id: int = 0
    eos_token_id: int = 1
    decoder_start_token_id: int = 0

    def __post_init__(self):
        self.is_gated_act = self.feed_forward_proj.startswith("gated-")
        self.dense_act_fn = (
            self.feed_forward_proj.split("-")[1]
            if "-" in self.feed_forward_proj
            else self.feed_forward_proj
        )
        if self.dense_act_fn == "gelu":
            self.dense_act_fn = "gelu_new"

    @staticmethod
    def from_gguf_properties(properties: dict[str, Any], **kwargs):
        assert properties["general.architecture"] == "t5"
        assert (
            properties["t5.attention.layer_norm_epsilon"]
            == properties["t5.attention.layer_norm_rms_epsilon"]
        )

        gguf_to_config_names_map = {
            "t5.embedding_length": ["d_model"],
            "t5.feed_forward_length": ["d_ff"],
            "t5.block_count": ["num_layers", "num_decoder_layers"],
            "t5.attention.head_count": ["num_heads"],
            "t5.attention.key_length": ["d_kv"],
            "t5.attention.layer_norm_epsilon": ["layer_norm_epsilon"],
            "t5.attention.relative_buckets_count": ["relative_attention_num_buckets"],
            "t5.decoder_start_token_id": ["decoder_start_token_id"],
            "tokenizer.ggml.eos_token_id": ["eos_token_id"],
            "tokenizer.ggml.padding_token_id": ["pad_token_id"],
        }
        all_kwargs = {"vocab_size": None, "feed_forward_proj": None}
        all_kwargs.update(
            {
                config_name: properties[gguf_name]
                for gguf_name, config_names in gguf_to_config_names_map.items()
                for config_name in config_names
            }
        )
        all_kwargs.update(kwargs)

        return T5Config(**all_kwargs)
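
For reference, a minimal sketch of how the new config plumbing fits together. The properties dict below is hypothetical (hand-written rather than read from an actual GGUF file), its keys simply follow gguf_to_config_names_map above, and the import path is assumed from the file location; the assertions reflect the __post_init__ derivation.

    from sharktank.layers.configs.llm_configs import T5Config

    # Hypothetical GGUF metadata for a small T5 checkpoint; real values would be
    # read out of the .gguf file produced by convert_hf_to_gguf.py.
    properties = {
        "general.architecture": "t5",
        "t5.embedding_length": 512,
        "t5.feed_forward_length": 1024,
        "t5.block_count": 8,
        "t5.attention.head_count": 6,
        "t5.attention.key_length": 64,
        "t5.attention.layer_norm_epsilon": 1e-6,
        "t5.attention.layer_norm_rms_epsilon": 1e-6,
        "t5.attention.relative_buckets_count": 32,
        "t5.decoder_start_token_id": 0,
        "tokenizer.ggml.eos_token_id": 1,
        "tokenizer.ggml.padding_token_id": 0,
    }

    # vocab_size and feed_forward_proj are not covered by the map above, so they
    # must be supplied as kwargs; "gated-gelu" exercises the __post_init__ logic.
    config = T5Config.from_gguf_properties(
        properties, vocab_size=32128, feed_forward_proj="gated-gelu"
    )
    assert config.is_gated_act
    assert config.dense_act_fn == "gelu_new"
    assert config.num_layers == config.num_decoder_layers == 8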
sharktank/sharktank/layers/ffn_block.py (+20 −8)
@@ -4,11 +4,12 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from typing import Optional
from typing import Optional, Callable

import torch
import torch.nn.functional as F
from .. import ops
from ..types import AnyTensor

from .base import Theta, ThetaLayer
from .linear import LinearLayer
@@ -22,18 +23,29 @@ class FFN(ThetaLayer):
     def __init__(
         self,
         theta: Theta,
+        is_gated: bool = True,
+        activation_fn: Callable[[AnyTensor], AnyTensor] = F.silu,
     ):
         super().__init__(theta)
 
-        self.add_module("ffn_gate", LinearLayer(theta("ffn_gate")))
+        self.is_gated = is_gated
+        self.activation_fn = activation_fn
+        if self.is_gated:
+            self.add_module("ffn_gate", LinearLayer(theta("ffn_gate")))
         self.add_module("ffn_up", LinearLayer(theta("ffn_up")))
         self.add_module("ffn_down", LinearLayer(theta("ffn_down")))
 
     def forward(
         self,
-        h: torch.Tensor,
-    ):
-        ffn_gate = ops.elementwise(F.silu, self.ffn_gate(h))
-        ffn_up = self.ffn_up(h)
-        ffn_down = self.ffn_down(ffn_gate * ffn_up)
-        return ffn_down
+        h: AnyTensor,
+    ) -> AnyTensor:
+        if self.is_gated:
+            ffn_gate = ops.elementwise(self.activation_fn, self.ffn_gate(h))
+            ffn_up = self.ffn_up(h)
+            ffn_down = self.ffn_down(ffn_gate * ffn_up)
+            return ffn_down
+        else:
+            h = self.ffn_up(h)
+            h = ops.elementwise(self.activation_fn, h)
+            h = self.ffn_down(h)
+            return h
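
A small usage sketch of the generalized layer, illustrating the new is_gated and activation_fn parameters. The helper below and its theta argument are illustrative only: building a Theta that holds the required ffn_up/ffn_down tensors (plus ffn_gate for the gated case) is outside this diff, and the import path is assumed from the file location.

    import torch.nn.functional as F

    from sharktank.layers.ffn_block import FFN

    def build_ffn_variants(theta):
        # theta must provide "ffn_up" and "ffn_down" sub-thetas, plus "ffn_gate"
        # when the gated path is used.

        # Default behavior, unchanged by this PR:
        # ffn_down(silu(ffn_gate(h)) * ffn_up(h)).
        gated = FFN(theta)

        # New non-gated path with a configurable activation:
        # ffn_down(relu(ffn_up(h))).
        ungated = FFN(theta, is_gated=False, activation_fn=F.relu)

        return gated, ungated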