diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml
new file mode 100644
index 000000000..f52804008
--- /dev/null
+++ b/.github/workflows/ci-llama-large-tests.yaml
@@ -0,0 +1,90 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: Llama Benchmarking Tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekdays at 6:00 AM UTC = 10:00 PM PST (previous day).
+    - cron: "0 6 * * 1-5"
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  test_llama_large:
+    name: "Llama Benchmarking Tests"
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: llama-mi300x-1
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+      VENV_DIR: ${{ github.workspace }}/.venv
+    steps:
+      - name: Get Current Date
+        id: date
+        # ::set-output is deprecated; write to $GITHUB_OUTPUT instead.
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@v3
+
+      - name: Cache Pip Packages
+        uses: actions/cache@v4
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
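+          # (The pins below are date-stamped nightly builds; 20241104.1068 is
+          # the 2024-11-04 nightly, kept in sync with the quick-tests workflow.)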
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-compiler==20241104.1068 \
+            iree-runtime==20241104.1068 \
+            "numpy<2.0"
+
+      - name: Run llama tests
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out
+
+      - name: Upload llama executable files
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-files
+          path: ${{ github.workspace }}/${{ steps.date.outputs.date }}
diff --git a/.github/workflows/ci-llama.yaml b/.github/workflows/ci-llama-quick-tests.yaml
similarity index 86%
rename from .github/workflows/ci-llama.yaml
rename to .github/workflows/ci-llama-quick-tests.yaml
index 9d8d930f0..fec3f6635 100644
--- a/.github/workflows/ci-llama.yaml
+++ b/.github/workflows/ci-llama-quick-tests.yaml
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-name: Llama Benchmarking Tests
+name: Llama Benchmarking 8B Tests
 
 on:
   workflow_dispatch:
@@ -22,8 +22,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  test_llama:
-    name: "Llama Benchmarking Tests"
+  test_llama_quick:
+    name: "Llama Benchmarking 8B Tests"
     strategy:
       matrix:
         version: [3.11]
@@ -71,18 +71,12 @@ jobs:
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-compiler \
-            iree-runtime \
+            iree-compiler==20241104.1068 \
+            iree-runtime==20241104.1068 \
             "numpy<2.0"
 
-      - name: Run llama test
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942 --html=out/index.html
-
-      # - name: Deploy to GitHub Pages
-      #   uses: peaceiris/actions-gh-pages@v3
-      #   with:
-      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-      #     publish_dir: ./out
+      - name: Run llama 8b tests
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
 
       - name: Upload llama executable files
         uses: actions/upload-artifact@v4
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index e88974d16..ed09a1fd1 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -69,7 +69,23 @@ def pytest_addoption(parser):
         action="store_true",
         dest="longrun",
         default=False,
-        help="Enable long and slow tests",
+        help="Enable long tests",
+    )
+
+    parser.addoption(
+        "--run-8b-llama",
+        action="store_true",
+        dest="run-8b-llama",
+        default=False,
+        help="Enable llama 8b benchmarking tests",
+    )
+
+    parser.addoption(
+        "--run-all-llama",
+        action="store_true",
+        dest="run-all-llama",
+        default=False,
+        help="Enable all llama benchmarking tests",
     )
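+    # Example invocations (mirroring the CI workflows above):
+    #   pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-8b-llama --iree-hip-target=gfx942
+    #   pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942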
found {args.tensor_parallelism_size}" + ) + hp = LlamaHParams.from_gguf_props(dataset.properties) llama_config = LlamaModelConfig( hp, tensor_parallelism_size=args.tensor_parallelism_size diff --git a/sharktank/sharktank/models/llama/tools/shard_llama.py b/sharktank/sharktank/models/llama/tools/shard_llama.py deleted file mode 100644 index bd1ffa696..000000000 --- a/sharktank/sharktank/models/llama/tools/shard_llama.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from pathlib import Path -import re - -import numpy as np -import torch - -from ....layers import configs -from ..llama import LlamaModelConfig -from ..sharding import shard_theta - - -def main(): - from ....utils import cli - - parser = cli.create_parser() - cli.add_input_dataset_options(parser) - parser.add_argument( - "--output-file", type=Path, help="Save the dataset to an IRPA file" - ) - parser.add_argument( - "--shard_count", - required=True, - type=int, - help="Level of parallelism in sharding", - ) - args = cli.parse(parser) - dataset = cli.get_input_dataset(args) - - if args.output_file is None: - raise RuntimeError(f"Need file destination for IRPA file") - - if args.shard_count < 2: - raise RuntimeError(f"Expect sharding greater than 1 found {args.shard_count}") - - hp = configs.LlamaHParams.from_gguf_props(dataset.properties) - llama_config = LlamaModelConfig(hp) - llama_config.kv_cache_type = "paged" - llama_config.tensor_parallelism_size = args.shard_count - dataset.root_theta = shard_theta(dataset.root_theta, llama_config) - - def report(s): - print(f"Save: {s}") - - print(f"Saving to: {args.output_file}") - dataset.save(args.output_file, io_report_callback=report) - - -if __name__ == "__main__": - main() diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 658a14439..057d3b664 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -123,17 +123,18 @@ def wrapper(*args, **kwargs): def shard_irpa_file( self, *, - output_file: str, + gguf_file: str, + output_irpa: str, ): shard_irpa_args = [ "python3", "-m", - "sharktank.models.llama.tools.shard_llama", - "--irpa-file", - self.irpa_path, - "--output-file", - output_file, - "--shard_count", + "sharktank.examples.sharding.shard_llm_dataset", + "--gguf-file", + gguf_file, + "--output-irpa-file", + output_irpa, + "--tensor-parallelism-size", str(self.tensor_parallelism_size), ] @@ -145,7 +146,7 @@ def shard_irpa_file( proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd, text=True) if proc.returncode != 0: logger.error( - f"Error sharding irpa file with shard_llama.py\n" + f"Error sharding irpa file with shard_llm_dataset.py\n" f"{proc.stdout+proc.stderr}" ) else: @@ -203,6 +204,12 @@ def compile_to_vmfb( f"--iree-hal-target-backends={self.iree_hal_target_backends}", f"-o={vmfb_path}", ] + if self.tensor_parallelism_size > 1: + iree_hal_target_devices = [ + f"--iree-hal-target-device=hip[{i}]" + for i in range(self.tensor_parallelism_size) + ] + compile_args += iree_hal_target_devices if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" @@ -234,16 +241,34 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. 
+        if self.tensor_parallelism_size > 1:
+            iree_hal_target_devices = [
+                f"--iree-hal-target-device=hip[{i}]"
+                for i in range(self.tensor_parallelism_size)
+            ]
+            compile_args += iree_hal_target_devices
         if hal_dump_path:
             compile_args += [
                 f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"
             ]
@@ -234,16 +241,34 @@ def iree_benchmark_vmfb(
             compile_cmd: Command used to compile the program, for inclusion in error messages.
         Raises Exception if running fails for some reason.
         """
-        benchmark_args = [
-            f"ROCR_VISIBLE_DEVICES={hip_device_id}",
+        benchmark_args = []
+        if self.tensor_parallelism_size > 1:
+            base_irpa_path, _ = os.path.splitext(irpa_path)
+            rocr_visible_devices = [
+                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
+            ]
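+            # Sharded weights are loaded as the unranked base file plus one
+            # <base>.rank{i}.irpa file per device, each passed as a parameter.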
+            params = [f"--parameters=model={base_irpa_path}.irpa"]
+            params += [
+                f"--parameters=model={base_irpa_path}.rank{i}.irpa"
+                for i in range(self.tensor_parallelism_size)
+            ]
+            devices = [
+                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
+            ]
+        else:
+            rocr_visible_devices = [f"ROCR_VISIBLE_DEVICES={hip_device_id}"]
+            params = [f"--parameters=model={irpa_path}"]
+            devices = [f"--device=hip://{hip_device_id}"]
+        benchmark_args += rocr_visible_devices
+        benchmark_args += [
             "iree-benchmark-module",
-            f"--device=hip://{hip_device_id}",
             "--hip_use_streams=true",
             "--hip_allow_inline_execution=true",
             "--device_allocator=caching",
             f"--module={vmfb_name}",
-            f"--parameters=model={irpa_path}",
         ]
+        benchmark_args += params
+        benchmark_args += devices
         benchmark_args += args
         cmd = subprocess.list2cmdline(benchmark_args)
         logging.getLogger().info(f"Launching run command:\n" f"cd {cwd} && {cmd}")
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index ae2b3b35a..adbfeaf7e 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -20,8 +20,11 @@
     IreeCompileException,
 )
 
-longrun = pytest.mark.skipif("not config.getoption('longrun')")
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
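+# Skips the large (70B/405B) suites below when only the quick 8B run is requested.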
+skipif_run_8b_llama = pytest.mark.skipif(
+    'config.getoption("run-8b-llama") and not config.getoption("run-all-llama")',
+    reason="Skipping large tests when --run-8b-llama is set.",
+)
 
 
 @pytest.mark.usefixtures("get_iree_flags")
@@ -48,18 +51,20 @@ def setUp(self):
         self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0")
 
 
+@is_mi300x
 class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
         self.artifacts_dir = Path("/data/llama-3.1/weights/8b")
+        self.gguf_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.gguf"
         self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa"
         self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa"
         self.tensor_parallelism_size = 1
         self.dir_path_8b = self.dir_path / "llama-8b"
         self.temp_dir_8b = Path(self.dir_path_8b)
         self.temp_dir_8b.mkdir(parents=True, exist_ok=True)
-        self.llama8b_f16_artifacts = ExportArtifacts(
+        self.llama8b_f16_decomposed_artifacts = ExportArtifacts(
             irpa_path=str(self.irpa_path),
             batch_size=4,
             iree_hip_target="gfx942",
@@ -67,10 +72,30 @@ def setUp(self):
             attention_kernel="decomposed",
             tensor_parallelism_size=self.tensor_parallelism_size,
         )
-        self.iree_compile_args = [
-            "--iree-hal-target-backends=rocm",
-            f"--iree-hip-target={self.iree_hip_target}",
-        ]
+        self.llama8b_f16_torch_sdpa_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="torch",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
+        self.llama8b_fp8_decomposed_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path_fp8),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="decomposed",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
+        self.llama8b_fp8_torch_sdpa_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path_fp8),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="torch",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
         self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
         self.decode_args_f16 = self.artifacts_dir / "decode_args"
         self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
         self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
@@ -110,41 +135,29 @@ def setUp(self):
             "--benchmark_repetitions=3",
         ]
 
-    @longrun
-    @is_mi300x
     def testBenchmark8B_f16_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_decomposed"
-        output_mlir = self.llama8b_f16_artifacts.create_file(
+        output_mlir = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".mlir", prefix=output_file_name
         )
-        output_json = self.llama8b_f16_artifacts.create_file(
+        output_json = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".json", prefix=output_file_name
         )
-        output_vmfb = self.llama8b_f16_artifacts.create_file(
+        output_vmfb = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
-        output_shard_file_name = str(
-            self.artifacts_dir
-            / f"llama3.1_8b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
-        )
-        # shard_irpa file
-        shard_return_code = self.llama8b_f16_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
-        )
-        if shard_return_code == 0:
-            self.irpa_path = output_shard_file_name
-        export_return_code = self.llama8b_f16_artifacts.export_to_mlir(
+        export_return_code = self.llama8b_f16_decomposed_artifacts.export_to_mlir(
             mlir_path=output_mlir,
             json_path=output_json,
         )
-        self.llama8b_f16_artifacts.compile_to_vmfb(
+        self.llama8b_f16_decomposed_artifacts.compile_to_vmfb(
            mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
         )
         # benchmark prefill
-        self.llama8b_f16_artifacts.iree_benchmark_vmfb(
+        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
             hip_device_id=self.hip_device_id,
             vmfb_name=output_vmfb,
             irpa_path=self.irpa_path,
@@ -152,7 +165,7 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )
         # benchmark decode
-        self.llama8b_f16_artifacts.iree_benchmark_vmfb(
+        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
             hip_device_id=self.hip_device_id,
             vmfb_name=output_vmfb,
             irpa_path=self.irpa_path,
@@ -160,43 +173,31 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )
 
-    @longrun
-    @is_mi300x
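+    # Note: strict=True means an unexpected pass (XPASS) is reported as a
+    # failure, so these xfail markers surface it when the underlying bug is fixed.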
self.dir_path_8b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_file_name = self.dir_path_8b / "fp8_torch" + output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + export_return_code = self.llama8b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama8b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, ) +@is_mi300x +@skipif_run_8b_llama class BenchmarkLlama3_1_70B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - artifacts_dir = Path("/data/llama-3.1/weights/70b") - self.irpa_path = artifacts_dir / "fp16/llama3.1_70b_f16.irpa" - self.irpa_path_fp8 = artifacts_dir / "f8/llama70b_fp8.irpa" - self.tensor_parallelism_size = 1 + self.artifacts_dir = Path("/data/llama-3.1/weights/70b") + self.gguf_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.gguf" + self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" + self.tensor_parallelism_size = 8 self.dir_path_70b = self.dir_path / "llama-70b" self.temp_dir_70b = Path(self.dir_path_70b) self.temp_dir_70b.mkdir(parents=True, exist_ok=True) - self.iree_compile_args = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={self.iree_hip_target}", - ] - self.prefill_args_f16 = artifacts_dir / "prefill_args" - self.decode_args_f16 = artifacts_dir / "decode_args" - self.prefill_args_fp8 = artifacts_dir / "prefill_args_fp8" - self.decode_args_fp8 = artifacts_dir / "decode_args_fp8" + self.llama70b_f16_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_f16_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + 
iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_fp8_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_fp8_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.decode_args_f16 = self.artifacts_dir / "decode_args" + self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" + self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" self.iree_run_prefill_args = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_f16}/tokens.npy", @@ -355,35 +381,38 @@ def setUp(self): "--benchmark_repetitions=3", ] - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark70B_f16_Decomposed(self): + def testBenchmark70B_f16_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "f16_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_f16_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama70b_f16_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -391,7 +420,7 @@ def testBenchmark70B_f16_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -399,35 +428,36 @@ def testBenchmark70B_f16_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x - @pytest.mark.xfail( - reason="Test not yet 
implemented", strict=True, raises=AttributeError - ) - def testBenchmark70B_f16_Non_Decomposed(self): - output_file_name = self.dir_path_70b / "f16_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) + def testBenchmark70B_f16_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_70b / "f16_torch" + output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -435,7 +465,7 @@ def testBenchmark70B_f16_Non_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -443,35 +473,38 @@ def testBenchmark70B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark70B_fp8_Decomposed(self): + def testBenchmark70B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".json", 
prefix=output_file_name + ) + output_vmfb = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama70b_fp8_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, @@ -479,7 +512,7 @@ def testBenchmark70B_fp8_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, @@ -487,70 +520,104 @@ def testBenchmark70B_fp8_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark70B_fp8_Non_Decomposed(self): - output_file_name = self.dir_path_70b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + def testBenchmark70B_fp8_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_70b / "fp8_torch" + output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_70b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama70b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + 
self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, ) +@is_mi300x +@skipif_run_8b_llama class BenchmarkLlama3_1_405B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - artifacts_dir = Path("/data/llama-3.1/weights/405b") - self.irpa_path = artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.irpa_path_fp8 = artifacts_dir / "f8/llama405b_fp8.irpa" + self.artifacts_dir = Path("/data/llama-3.1/weights/405b") + self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" + self.gguf_path = self.artifacts_dir / "fp16/llama3_405b_f16.gguf" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" self.temp_dir_405b = Path(self.dir_path_405b) self.temp_dir_405b.mkdir(parents=True, exist_ok=True) - self.iree_compile_args = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={self.iree_hip_target}", - ] - self.prefill_args_f16 = artifacts_dir / "prefill_args" - self.decode_args_f16 = artifacts_dir / "decode_args" - self.prefill_args_fp8 = artifacts_dir / "prefill_args_fp8" - self.decode_args_fp8 = artifacts_dir / "decode_args_fp8" + self.llama405b_f16_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_f16_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_fp8_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_fp8_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.decode_args_f16 = self.artifacts_dir / "decode_args" + self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" + self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" self.iree_run_prefill_args = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_f16}/tokens.npy", @@ -586,35 +653,38 @@ def setUp(self): "--benchmark_repetitions=3", ] - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark405B_f16_Decomposed(self): + def testBenchmark405B_f16_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "f16_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - 
tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_f16_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama405b_f16_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -622,7 +692,7 @@ def testBenchmark405B_f16_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -630,35 +700,36 @@ def testBenchmark405B_f16_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError - ) - def testBenchmark405B_f16_Non_Decomposed(self): - output_file_name = self.dir_path_405b / "f16_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) + def testBenchmark405B_f16_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_405b / "f16_torch" + output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) 
+ self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -666,7 +737,7 @@ def testBenchmark405B_f16_Non_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -674,91 +745,97 @@ def testBenchmark405B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark405B_fp8_Decomposed(self): + def testBenchmark405B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama405b_fp8_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, - irpa_path=self.irpa_path, + irpa_path=self.irpa_path_fp8, args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, - irpa_path=self.irpa_path, + irpa_path=self.irpa_path_fp8, args=self.iree_run_decode_args, cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark405B_fp8_Non_Decomposed(self): - output_file_name = self.dir_path_405b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - 
output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + def testBenchmark405B_fp8_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_405b / "fp8_torch" + output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama405b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, )