diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml
new file mode 100644
index 000000000..f52804008
--- /dev/null
+++ b/.github/workflows/ci-llama-large-tests.yaml
@@ -0,0 +1,90 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: Llama Benchmarking Tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekdays at 6:00 AM UTC = 10:00 PM PST (previous day).
+    - cron: "0 6 * * 1-5"
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  test_llama_large:
+    name: "Llama Benchmarking Tests"
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: llama-mi300x-1
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+      VENV_DIR: ${{ github.workspace }}/.venv
+    steps:
+      - name: Get Current Date
+        id: date
+        # ::set-output is deprecated; write to $GITHUB_OUTPUT instead.
+        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@v3
+
+      - name: Cache Pip Packages
+        uses: actions/cache@v4
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
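+          # (The pins below are date-stamped nightly builds; 20241104.1068 is
+          # the 2024-11-04 nightly, kept in sync with the quick-tests workflow.)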
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-compiler==20241104.1068 \
+            iree-runtime==20241104.1068 \
+            "numpy<2.0"
+
+      - name: Run llama tests
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out
+
+      - name: Upload llama executable files
+        uses: actions/upload-artifact@v4
+        with:
+          name: llama-files
+          path: ${{ github.workspace }}/${{ steps.date.outputs.date }}
diff --git a/.github/workflows/ci-llama.yaml b/.github/workflows/ci-llama-quick-tests.yaml
similarity index 86%
rename from .github/workflows/ci-llama.yaml
rename to .github/workflows/ci-llama-quick-tests.yaml
index 9d8d930f0..fec3f6635 100644
--- a/.github/workflows/ci-llama.yaml
+++ b/.github/workflows/ci-llama-quick-tests.yaml
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-name: Llama Benchmarking Tests
+name: Llama Benchmarking 8B Tests
 
 on:
   workflow_dispatch:
@@ -22,8 +22,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  test_llama:
-    name: "Llama Benchmarking Tests"
+  test_llama_quick:
+    name: "Llama Benchmarking 8B Tests"
     strategy:
       matrix:
         version: [3.11]
@@ -71,18 +71,12 @@ jobs:
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-compiler \
-            iree-runtime \
+            iree-compiler==20241104.1068 \
+            iree-runtime==20241104.1068 \
             "numpy<2.0"
 
-      - name: Run llama test
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942 --html=out/index.html
-
-      # - name: Deploy to GitHub Pages
-      #   uses: peaceiris/actions-gh-pages@v3
-      #   with:
-      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-      #     publish_dir: ./out
+      - name: Run llama 8b tests
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
 
       - name: Upload llama executable files
         uses: actions/upload-artifact@v4
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index e88974d16..ed09a1fd1 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -69,7 +69,23 @@ def pytest_addoption(parser):
         action="store_true",
         dest="longrun",
         default=False,
-        help="Enable long and slow tests",
+        help="Enable long tests",
+    )
+
+    parser.addoption(
+        "--run-8b-llama",
+        action="store_true",
+        dest="run-8b-llama",
+        default=False,
+        help="Enable llama 8b benchmarking tests",
+    )
+
+    parser.addoption(
+        "--run-all-llama",
+        action="store_true",
+        dest="run-all-llama",
+        default=False,
+        help="Enable all llama benchmarking tests",
     )
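+    # Example invocations (mirroring the CI workflows above):
+    #   pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-8b-llama --iree-hip-target=gfx942
+    #   pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942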
found {args.tensor_parallelism_size}" + ) + hp = LlamaHParams.from_gguf_props(dataset.properties) llama_config = LlamaModelConfig( hp, tensor_parallelism_size=args.tensor_parallelism_size diff --git a/sharktank/sharktank/models/llama/tools/shard_llama.py b/sharktank/sharktank/models/llama/tools/shard_llama.py deleted file mode 100644 index bd1ffa696..000000000 --- a/sharktank/sharktank/models/llama/tools/shard_llama.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from pathlib import Path -import re - -import numpy as np -import torch - -from ....layers import configs -from ..llama import LlamaModelConfig -from ..sharding import shard_theta - - -def main(): - from ....utils import cli - - parser = cli.create_parser() - cli.add_input_dataset_options(parser) - parser.add_argument( - "--output-file", type=Path, help="Save the dataset to an IRPA file" - ) - parser.add_argument( - "--shard_count", - required=True, - type=int, - help="Level of parallelism in sharding", - ) - args = cli.parse(parser) - dataset = cli.get_input_dataset(args) - - if args.output_file is None: - raise RuntimeError(f"Need file destination for IRPA file") - - if args.shard_count < 2: - raise RuntimeError(f"Expect sharding greater than 1 found {args.shard_count}") - - hp = configs.LlamaHParams.from_gguf_props(dataset.properties) - llama_config = LlamaModelConfig(hp) - llama_config.kv_cache_type = "paged" - llama_config.tensor_parallelism_size = args.shard_count - dataset.root_theta = shard_theta(dataset.root_theta, llama_config) - - def report(s): - print(f"Save: {s}") - - print(f"Saving to: {args.output_file}") - dataset.save(args.output_file, io_report_callback=report) - - -if __name__ == "__main__": - main() diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 658a14439..057d3b664 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -123,17 +123,18 @@ def wrapper(*args, **kwargs): def shard_irpa_file( self, *, - output_file: str, + gguf_file: str, + output_irpa: str, ): shard_irpa_args = [ "python3", "-m", - "sharktank.models.llama.tools.shard_llama", - "--irpa-file", - self.irpa_path, - "--output-file", - output_file, - "--shard_count", + "sharktank.examples.sharding.shard_llm_dataset", + "--gguf-file", + gguf_file, + "--output-irpa-file", + output_irpa, + "--tensor-parallelism-size", str(self.tensor_parallelism_size), ] @@ -145,7 +146,7 @@ def shard_irpa_file( proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd, text=True) if proc.returncode != 0: logger.error( - f"Error sharding irpa file with shard_llama.py\n" + f"Error sharding irpa file with shard_llm_dataset.py\n" f"{proc.stdout+proc.stderr}" ) else: @@ -203,6 +204,12 @@ def compile_to_vmfb( f"--iree-hal-target-backends={self.iree_hal_target_backends}", f"-o={vmfb_path}", ] + if self.tensor_parallelism_size > 1: + iree_hal_target_devices = [ + f"--iree-hal-target-device=hip[{i}]" + for i in range(self.tensor_parallelism_size) + ] + compile_args += iree_hal_target_devices if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" @@ -234,16 +241,34 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. 
+        if self.tensor_parallelism_size > 1:
+            iree_hal_target_devices = [
+                f"--iree-hal-target-device=hip[{i}]"
+                for i in range(self.tensor_parallelism_size)
+            ]
+            compile_args += iree_hal_target_devices
         if hal_dump_path:
             compile_args += [
                 f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"
             ]
@@ -234,16 +241,34 @@ def iree_benchmark_vmfb(
             compile_cmd: Command used to compile the program, for inclusion in error messages.
         Raises Exception if running fails for some reason.
         """
-        benchmark_args = [
-            f"ROCR_VISIBLE_DEVICES={hip_device_id}",
+        benchmark_args = []
+        if self.tensor_parallelism_size > 1:
+            base_irpa_path, _ = os.path.splitext(irpa_path)
+            rocr_visible_devices = [
+                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(self.tensor_parallelism_size))}"
+            ]
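+            # Sharded weights are loaded as the unranked base file plus one
+            # <base>.rank{i}.irpa file per device, each passed as a parameter.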
+            params = [f"--parameters=model={base_irpa_path}.irpa"]
+            params += [
+                f"--parameters=model={base_irpa_path}.rank{i}.irpa"
+                for i in range(self.tensor_parallelism_size)
+            ]
+            devices = [
+                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
+            ]
+        else:
+            rocr_visible_devices = [f"ROCR_VISIBLE_DEVICES={hip_device_id}"]
+            params = [f"--parameters=model={irpa_path}"]
+            devices = [f"--device=hip://{hip_device_id}"]
+        benchmark_args += rocr_visible_devices
+        benchmark_args += [
             "iree-benchmark-module",
-            f"--device=hip://{hip_device_id}",
             "--hip_use_streams=true",
             "--hip_allow_inline_execution=true",
             "--device_allocator=caching",
             f"--module={vmfb_name}",
-            f"--parameters=model={irpa_path}",
         ]
+        benchmark_args += params
+        benchmark_args += devices
         benchmark_args += args
         cmd = subprocess.list2cmdline(benchmark_args)
         logging.getLogger().info(f"Launching run command:\n" f"cd {cwd} && {cmd}")
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index ae2b3b35a..adbfeaf7e 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -20,8 +20,11 @@
     IreeCompileException,
 )
 
-longrun = pytest.mark.skipif("not config.getoption('longrun')")
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
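+# Skips the large (70B/405B) suites below when only the quick 8B run is requested.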
+skipif_run_8b_llama = pytest.mark.skipif(
+    'config.getoption("run-8b-llama") and not config.getoption("run-all-llama")',
+    reason="Skipping large tests when --run-8b-llama is set.",
+)
 
 
 @pytest.mark.usefixtures("get_iree_flags")
@@ -48,18 +51,20 @@ def setUp(self):
         self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0")
 
 
+@is_mi300x
 class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
         self.artifacts_dir = Path("/data/llama-3.1/weights/8b")
+        self.gguf_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.gguf"
         self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa"
         self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa"
         self.tensor_parallelism_size = 1
         self.dir_path_8b = self.dir_path / "llama-8b"
         self.temp_dir_8b = Path(self.dir_path_8b)
         self.temp_dir_8b.mkdir(parents=True, exist_ok=True)
-        self.llama8b_f16_artifacts = ExportArtifacts(
+        self.llama8b_f16_decomposed_artifacts = ExportArtifacts(
             irpa_path=str(self.irpa_path),
             batch_size=4,
             iree_hip_target="gfx942",
@@ -67,10 +72,30 @@ def setUp(self):
             attention_kernel="decomposed",
             tensor_parallelism_size=self.tensor_parallelism_size,
         )
-        self.iree_compile_args = [
-            "--iree-hal-target-backends=rocm",
-            f"--iree-hip-target={self.iree_hip_target}",
-        ]
+        self.llama8b_f16_torch_sdpa_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="torch",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
+        self.llama8b_fp8_decomposed_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path_fp8),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="decomposed",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
+        self.llama8b_fp8_torch_sdpa_artifacts = ExportArtifacts(
+            irpa_path=str(self.irpa_path_fp8),
+            batch_size=4,
+            iree_hip_target="gfx942",
+            iree_hal_target_backends="rocm",
+            attention_kernel="torch",
+            tensor_parallelism_size=self.tensor_parallelism_size,
+        )
         self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
         self.decode_args_f16 = self.artifacts_dir / "decode_args"
         self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
         self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
@@ -110,41 +135,29 @@ def setUp(self):
             "--benchmark_repetitions=3",
         ]
 
-    @longrun
-    @is_mi300x
     def testBenchmark8B_f16_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_decomposed"
-        output_mlir = self.llama8b_f16_artifacts.create_file(
+        output_mlir = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".mlir", prefix=output_file_name
         )
-        output_json = self.llama8b_f16_artifacts.create_file(
+        output_json = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".json", prefix=output_file_name
         )
-        output_vmfb = self.llama8b_f16_artifacts.create_file(
+        output_vmfb = self.llama8b_f16_decomposed_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
-        output_shard_file_name = str(
-            self.artifacts_dir
-            / f"llama3.1_8b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
-        )
-        # shard_irpa file
-        shard_return_code = self.llama8b_f16_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
-        )
-        if shard_return_code == 0:
-            self.irpa_path = output_shard_file_name
-        export_return_code = self.llama8b_f16_artifacts.export_to_mlir(
+        export_return_code = self.llama8b_f16_decomposed_artifacts.export_to_mlir(
             mlir_path=output_mlir,
             json_path=output_json,
         )
-        self.llama8b_f16_artifacts.compile_to_vmfb(
+        self.llama8b_f16_decomposed_artifacts.compile_to_vmfb(
            mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
         )
         # benchmark prefill
-        self.llama8b_f16_artifacts.iree_benchmark_vmfb(
+        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
             hip_device_id=self.hip_device_id,
             vmfb_name=output_vmfb,
             irpa_path=self.irpa_path,
@@ -152,7 +165,7 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )
         # benchmark decode
-        self.llama8b_f16_artifacts.iree_benchmark_vmfb(
+        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
             hip_device_id=self.hip_device_id,
             vmfb_name=output_vmfb,
             irpa_path=self.irpa_path,
@@ -160,43 +173,31 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )
 
-    @longrun
-    @is_mi300x
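+    # Note: strict=True means an unexpected pass (XPASS) is reported as a
+    # failure, so these xfail markers surface it when the underlying bug is fixed.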
self.dir_path_8b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_file_name = self.dir_path_8b / "fp8_torch" + output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama8b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + export_return_code = self.llama8b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama8b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, ) +@is_mi300x +@skipif_run_8b_llama class BenchmarkLlama3_1_70B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - artifacts_dir = Path("/data/llama-3.1/weights/70b") - self.irpa_path = artifacts_dir / "fp16/llama3.1_70b_f16.irpa" - self.irpa_path_fp8 = artifacts_dir / "f8/llama70b_fp8.irpa" - self.tensor_parallelism_size = 1 + self.artifacts_dir = Path("/data/llama-3.1/weights/70b") + self.gguf_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.gguf" + self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" + self.tensor_parallelism_size = 8 self.dir_path_70b = self.dir_path / "llama-70b" self.temp_dir_70b = Path(self.dir_path_70b) self.temp_dir_70b.mkdir(parents=True, exist_ok=True) - self.iree_compile_args = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={self.iree_hip_target}", - ] - self.prefill_args_f16 = artifacts_dir / "prefill_args" - self.decode_args_f16 = artifacts_dir / "decode_args" - self.prefill_args_fp8 = artifacts_dir / "prefill_args_fp8" - self.decode_args_fp8 = artifacts_dir / "decode_args_fp8" + self.llama70b_f16_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_f16_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + 
iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_fp8_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama70b_fp8_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.decode_args_f16 = self.artifacts_dir / "decode_args" + self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" + self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" self.iree_run_prefill_args = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_f16}/tokens.npy", @@ -355,35 +381,38 @@ def setUp(self): "--benchmark_repetitions=3", ] - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark70B_f16_Decomposed(self): + def testBenchmark70B_f16_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "f16_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_f16_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_f16_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama70b_f16_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -391,7 +420,7 @@ def testBenchmark70B_f16_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -399,35 +428,36 @@ def testBenchmark70B_f16_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x - @pytest.mark.xfail( - reason="Test not yet 
implemented", strict=True, raises=AttributeError - ) - def testBenchmark70B_f16_Non_Decomposed(self): - output_file_name = self.dir_path_70b / "f16_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) + def testBenchmark70B_f16_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_70b / "f16_torch" + output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -435,7 +465,7 @@ def testBenchmark70B_f16_Non_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -443,35 +473,38 @@ def testBenchmark70B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark70B_fp8_Decomposed(self): + def testBenchmark70B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".json", 
prefix=output_file_name + ) + output_vmfb = self.llama70b_fp8_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama70b_fp8_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, @@ -479,7 +512,7 @@ def testBenchmark70B_fp8_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, @@ -487,70 +520,104 @@ def testBenchmark70B_fp8_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark70B_fp8_Non_Decomposed(self): - output_file_name = self.dir_path_70b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + def testBenchmark70B_fp8_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_70b / "fp8_torch" + output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama70b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_70b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama70b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama70b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + 
self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, ) +@is_mi300x +@skipif_run_8b_llama class BenchmarkLlama3_1_405B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - artifacts_dir = Path("/data/llama-3.1/weights/405b") - self.irpa_path = artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.irpa_path_fp8 = artifacts_dir / "f8/llama405b_fp8.irpa" + self.artifacts_dir = Path("/data/llama-3.1/weights/405b") + self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" + self.gguf_path = self.artifacts_dir / "fp16/llama3_405b_f16.gguf" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" self.temp_dir_405b = Path(self.dir_path_405b) self.temp_dir_405b.mkdir(parents=True, exist_ok=True) - self.iree_compile_args = [ - "--iree-hal-target-backends=rocm", - f"--iree-hip-target={self.iree_hip_target}", - ] - self.prefill_args_f16 = artifacts_dir / "prefill_args" - self.decode_args_f16 = artifacts_dir / "decode_args" - self.prefill_args_fp8 = artifacts_dir / "prefill_args_fp8" - self.decode_args_fp8 = artifacts_dir / "decode_args_fp8" + self.llama405b_f16_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_f16_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_fp8_decomposed_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="decomposed", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.llama405b_fp8_torch_sdpa_artifacts = ExportArtifacts( + irpa_path=str(self.irpa_path_fp8), + batch_size=4, + iree_hip_target="gfx942", + iree_hal_target_backends="rocm", + attention_kernel="torch", + tensor_parallelism_size=self.tensor_parallelism_size, + ) + self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.decode_args_f16 = self.artifacts_dir / "decode_args" + self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" + self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" self.iree_run_prefill_args = [ "--function=prefill_bs4", f"--input=@{self.prefill_args_f16}/tokens.npy", @@ -586,35 +653,38 @@ def setUp(self): "--benchmark_repetitions=3", ] - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException ) - def testBenchmark405B_f16_Decomposed(self): + def testBenchmark405B_f16_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "f16_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - 
tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_f16_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_f16_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) + self.llama405b_f16_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -622,7 +692,7 @@ def testBenchmark405B_f16_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -630,35 +700,36 @@ def testBenchmark405B_f16_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError - ) - def testBenchmark405B_f16_Non_Decomposed(self): - output_file_name = self.dir_path_405b / "f16_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) + def testBenchmark405B_f16_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_405b / "f16_torch" + output_mlir = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=iree_compile_args, + json_path=output_json, + ) 
+ self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -666,7 +737,7 @@ def testBenchmark405B_f16_Non_Decomposed(self): cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, @@ -674,91 +745,97 @@ def testBenchmark405B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark405B_fp8_Decomposed(self): + def testBenchmark405B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_decomposed" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="decomposed", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_fp8_decomposed_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_fp8_decomposed_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama405b_fp8_decomposed_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, - irpa_path=self.irpa_path, + irpa_path=self.irpa_path_fp8, args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, - irpa_path=self.irpa_path, + irpa_path=self.irpa_path_fp8, args=self.iree_run_decode_args, cwd=self.repo_root, ) - @longrun - @is_mi300x @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=AttributeError + reason="Test not yet implemented", strict=True, raises=ExportMlirException ) - def testBenchmark405B_fp8_Non_Decomposed(self): - output_file_name = self.dir_path_405b / "fp8_torch_sdpa" - output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name) - 
output_json = self.create_file(suffix=".json", prefix=output_file_name) - output_vmfb = self.create_file(suffix=".vmfb", prefix=output_file_name) - self.export_mlir( - attention_kernel="torch_sdpa", - tensor_parallelism_size=self.tensor_parallelism_size, - irpa_path=self.irpa_path_fp8, - output_mlir_path=output_mlir, - output_json_path=output_json, - cwd=self.repo_root, + def testBenchmark405B_fp8_TP8_Non_Decomposed(self): + output_file_name = self.dir_path_405b / "fp8_torch" + output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name ) - iree_compile_args = self.iree_compile_args + [ - f"--iree-hal-dump-executable-files-to={output_file_name}/files" - ] - self.iree_compile( + output_json = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama405b_fp8_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + output_shard_file_name = ( + self.artifacts_dir + / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa" + ) + if output_shard_file_name.exists(): + self.irpa_path = output_shard_file_name + export_return_code = self.llama405b_fp8_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, - output_vmfb_path=output_vmfb, - args=self.iree_compile_args, + json_path=output_json, + ) + self.llama405b_fp8_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, cwd=self.repo_root, ) # benchmark prefill - self.iree_benchmark_module( + self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_prefill_args_fp8, + args=self.iree_run_prefill_args, cwd=self.repo_root, ) # benchmark decode - self.iree_benchmark_module( + self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path_fp8, - args=self.iree_run_decode_args_fp8, + args=self.iree_run_decode_args, cwd=self.repo_root, )