[tuner] Add unified benchmarking and compilation for models + dispatches (#704)

This PR adds `benchmark()` and `compile()` functions to the tuner that
can be used for both model and dispatch tuning. The new functions
replace the split benchmark/compile_models and
benchmark/compile_dispatches functions. The new benchmarking and
compilation functions use the iree_runtime/iree_compiler python
bindings, which simplifies much of the code. In particular, benchmark
results are now mostly parsed by the bindings, so the
parse_*_benchmark_results functions are no longer needed. The new
compilation and benchmarking flows are described below.
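
As a rough illustration of the bindings-based flow (not the PR's actual code),
a single candidate vmfb could be produced like this; the file names are
hypothetical, and the flag matches the dispatch compile flags used in the
example tuner below:

```python
# Minimal sketch, not the tuner's implementation: compile one candidate to a
# vmfb with the iree.compiler Python bindings. File names are hypothetical.
from iree.compiler import compile_file

compile_file(
    "candidate_0_stripped.mlir",     # hypothetical candidate source (compilation info stripped)
    output_file="candidate_0.vmfb",  # vmfb written here on success
    extra_args=["--compile-from=executable-sources"],
)
```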

### Compilation ###

1. Populate each CandidateTracker with the input and output filepaths.
The input filepaths can be overridden by an optional function argument
to the compile() function. This argument can be used for model tuning,
passing the model filepath as the new input file.
2. For each candidate, strip the compilation info using iree-opt, and
compile to a vmfb with the iree compiler python bindings. Set the
candidate's TD spec file (generated during candidate generation), and
add any additional iree-compile flags that came from the TuningClient.
The extra flags are taken from a new abstract TuningClient function
called get_iree_compile_flags.
3. Save the vmfb for each successful compilation to the designated
output path; for each failed compilation, save a failure dump instead of
the vmfb.
4. Remove duplicate vmfbs, and return the ids of all unique candidates
(a usage sketch follows this list).
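
For reference, the example tuner added in this PR drives both compilation
passes through the same function; a hedged sketch, reusing the names from
tuner_test.py below:

```python
# Sketch based on the tuner_test.py example in this PR (names taken from it).
test_tuner = TestTuner()

# Dispatch compilation: each candidate is compiled from its own input file.
compiled_candidates = libtuner.compile(
    args, path_config, candidates, candidate_trackers, test_tuner
)

# Model compilation: the optional last argument overrides each candidate's
# input file with the model, so the same flow produces full-model vmfbs.
compiled_model_candidates = libtuner.compile(
    args, path_config, top_candidates, candidate_trackers, test_tuner,
    args.test_model_file,
)
```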

### Benchmarking ###

1. Create a benchmark task struct for each candidate, holding its
CandidateTracker and the TuningClient.
2. Run the candidate benchmarks on the available devices. Each benchmark
task benchmarks the vmfb from the CandidateTracker using the
iree_runtime python bindings and returns a benchmark result containing
the candidate_id, benchmark time, and device_id.
3. Run the same benchmark on the untuned baseline configuration once for
each available device.
4. Compare each candidate's result with the baseline result from the same
device, then log and return the fastest candidates. The number of
candidates returned is controlled by an optional argument to the
benchmark function; all candidates are returned by default (a usage
sketch follows this list).
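
The corresponding benchmark() call in the example tuner looks roughly like
this (a hedged sketch, again reusing the names from tuner_test.py below):

```python
# Sketch based on the tuner_test.py example in this PR: the optional last
# argument limits how many top candidates are returned (all by default).
top_candidates = libtuner.benchmark(
    args,
    path_config,
    compiled_candidates,
    candidate_trackers,
    test_tuner,
    args.test_num_dispatch_candidates,
)
```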

---------

Signed-off-by: Max Dawkins <[email protected]>
Max191 authored Dec 18, 2024
1 parent c4a592a commit aaee29a
Showing 10 changed files with 646 additions and 28 deletions.
9 changes: 9 additions & 0 deletions tuner/examples/dispatch/dispatch_tuner.py
@@ -79,6 +79,15 @@ def get_model_benchmark_command(
) -> list[str]:
return []

def get_iree_compile_flags(self) -> list[str]:
return []

def get_iree_benchmark_module_flags(self) -> list[str]:
return []

def get_benchmark_timeout_s(self) -> int:
return 0


def main():
args = libtuner.parse_arguments()
9 changes: 9 additions & 0 deletions tuner/examples/punet/punet_autotune.py
@@ -113,6 +113,15 @@ def get_model_benchmark_command(
]
return command

def get_iree_compile_flags(self) -> list[str]:
return []

def get_iree_benchmark_module_flags(self) -> list[str]:
return []

def get_benchmark_timeout_s(self) -> int:
return 0


def main():
args = libtuner.parse_arguments()
39 changes: 39 additions & 0 deletions tuner/examples/test/README.md
@@ -0,0 +1,39 @@
# Example Tuner Test

Example of tuning a dispatch and full model.

## Environments
Follow instructions in [`/tuner/README.md`](../README.md)

## Running the Tuner

### Choose a model to tune
This example uses the simple `double_mmt.mlir` file.

### Generate a benchmark file
Use the usual `iree-compile` command for your model, add
`--iree-hal-dump-executable-files-to=dump --iree-config-add-tuner-attributes`,
and get the dispatch benchmark that you want to tune. For example:
```shell
iree-compile double_mmt.mlir --iree-hal-target-backends=rocm \
--iree-hip-target=gfx942 --iree-hal-dump-executable-files-to=dump \
--iree-config-add-tuner-attributes -o /dev/null

cp dump/module_main_dispatch_0_rocm_hsaco_fb_benchmark.mlir mmt_benchmark.mlir
```

### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```shell
python -m examples.test double_mmt.mlir mmt_benchmark.mlir \
--test_num_dispatch_candidates=5 --test_num_model_candidates=3 \
--num-candidates=30
```

### Basic Usage
```shell
python -m examples.test <model_file_path> <benchmark_file_path> \
--test_num_dispatch_candidates=<num_dispatch_candidates> \
--test_num_model_candidates=<num_model_candidates> \
--test_hip_target=<hip_target> \
--num-candidates=<num_generated_candidates>
```
16 changes: 16 additions & 0 deletions tuner/examples/test/double_mmt.mlir
@@ -0,0 +1,16 @@
!matA_0 = tensor<2048x2048xf16>
!matB_0 = tensor<2048x2048xf16>
!matC_0 = tensor<2048x2048xf32>

!matC_1 = tensor<2048x2048xf32>

func.func @main(%arg0: !matA_0, %arg1: !matB_0) -> !matC_1 {
%cst = arith.constant 0.000000e+00 : f32
%5 = tensor.empty() : !matC_0
%6 = linalg.fill ins(%cst : f32) outs(%5 : !matC_0) -> !matC_0
%7 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
%8 = tensor.empty() : !matC_1
%9 = linalg.fill ins(%cst : f32) outs(%8 : !matC_1) -> !matC_1
%10 = linalg.matmul_transpose_b ins(%7, %7 : !matC_0, !matC_0) outs(%9 : !matC_1) -> !matC_1
return %10 : !matC_1
}
133 changes: 131 additions & 2 deletions tuner/examples/test/tuner_test.py
@@ -4,35 +4,164 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import argparse
from pathlib import Path
from tuner import libtuner


class TestTuner(libtuner.TuningClient):
def __init__(self):
super().__init__()
self.compile_flags = ["--compile-from=executable-sources"]
self.benchmark_flags = ["--benchmark_repetitions=3", "--input=1"]

def get_iree_compile_flags(self) -> list[str]:
return self.compile_flags

def get_iree_benchmark_module_flags(self) -> list[str]:
return self.benchmark_flags

def get_benchmark_timeout_s(self) -> int:
return 10

# TODO(Max191): Remove the following unused abstract functions once they
# are removed from the TuningClient definition.
def get_dispatch_benchmark_timeout_s(self) -> int:
return 0

def get_dispatch_compile_timeout_s(self) -> int:
return 0

def get_dispatch_compile_command(
self, candidate_tracker: libtuner.CandidateTracker
) -> list[str]:
return []

def get_dispatch_benchmark_command(
self,
candidate_tracker: libtuner.CandidateTracker,
) -> list[str]:
return []

def get_model_compile_timeout_s(self) -> int:
return 0

def get_model_compile_command(
self, candidate_tracker: libtuner.CandidateTracker
) -> list[str]:
return []

def get_model_benchmark_timeout_s(self) -> int:
return 0

def get_model_benchmark_command(
self, candidate_tracker: libtuner.CandidateTracker
) -> list[str]:
return []


def main():
args = libtuner.parse_arguments()
# Custom arguments for the test file.
parser = argparse.ArgumentParser(description="Autotune test script")
test_args = parser.add_argument_group("Example Test Options")
test_args.add_argument(
"test_model_file", type=Path, help="Path to the model file to tune (.mlir)"
)
test_args.add_argument(
"--test_num_dispatch_candidates",
type=int,
default=None,
help="Number of dispatch candidates to keep for model benchmarks.",
)
test_args.add_argument(
"--test_num_model_candidates",
type=int,
default=None,
help="Number of model candidates to produce after tuning.",
)
test_args.add_argument(
"--test_hip_target",
type=str,
default="gfx942",
help="Hip target for tuning.",
)
# Remaining arguments come from libtuner
args = libtuner.parse_arguments(parser)

path_config = libtuner.PathConfig()
path_config.base_dir.mkdir(parents=True, exist_ok=True)
path_config.output_unilog.touch()
# TODO(Max191): Make candidate_trackers internal to TuningClient.
candidate_trackers: list[libtuner.CandidateTracker] = []
stop_after_phase: str = args.stop_after

print("Setup logging")
libtuner.setup_logging(args, path_config)
print(path_config.run_log, end="\n\n")

# TODO(Max191): Some bug seems to be causing OOM errors in benchmarking
# when device validation happens, so this is commented for now. Uncomment
# when the bug is fixed.
if not args.dry_run:
print("Validating devices")
libtuner.validate_devices(args.devices)
print("Validation successful!\n")

print("Generating candidates...")
test_tuner = TestTuner()
candidates = libtuner.generate_candidate_specs(
args, path_config, candidate_trackers
args, path_config, candidate_trackers, test_tuner
)
print(f"Stored candidate specs in {path_config.specs_dir}\n")
if stop_after_phase == libtuner.ExecutionPhases.generate_candidates:
return

print("Compiling candidates...")
compiled_candidates = libtuner.compile(
args, path_config, candidates, candidate_trackers, test_tuner
)

print("Benchmarking compiled candidates...")
top_candidates = libtuner.benchmark(
args,
path_config,
compiled_candidates,
candidate_trackers,
test_tuner,
args.test_num_dispatch_candidates,
)

print("Compiling models with top candidates...")
test_tuner.compile_flags = [
"--iree-hal-target-backends=rocm",
f"--iree-hip-target={args.test_hip_target}",
]
compiled_model_candidates = libtuner.compile(
args,
path_config,
top_candidates,
candidate_trackers,
test_tuner,
args.test_model_file,
)

print("Benchmarking compiled model candidates...")
test_tuner.benchmark_flags = [
"--benchmark_repetitions=3",
"--input=2048x2048xf16",
"--input=2048x2048xf16",
]
top_model_candidates = libtuner.benchmark(
args,
path_config,
compiled_model_candidates,
candidate_trackers,
test_tuner,
args.test_num_model_candidates,
)

print(f"Top model candidates: {top_model_candidates}")

print("Check the detailed execution logs in:")
print(path_config.run_log.resolve())

14 changes: 8 additions & 6 deletions tuner/tuner/candidate_gen.py
@@ -118,14 +118,18 @@ def get_td_spec(


class DispatchTunerRegistry:
def __init__(self):
def __init__(self, check_translation_info=True):
self.check_translation_info = check_translation_info
self.registry = set()

def register(self, dispatch_tuners: list[DispatchTuner]) -> None:
for dispatch_tuner in dispatch_tuners:
self.registry.add(dispatch_tuner)

# TODO(Max191): Remove translation info validation.
def validate_translation(self, attrs: list[ir.NamedAttribute]) -> bool:
if not self.check_translation_info:
return True
for attr in attrs:
if (attr.name == "translation_info") and (
"LLVMGPUVectorDistribute" in str(attr.attr)
@@ -641,7 +645,7 @@ def generate_configs_and_td_specs(
limit: int = 4096, # Max candidates to be generated
num_subgroups: int = 4, # GPU spec, used to determine candidate generation constraints
) -> list[ir.Module]:
dispatch_tuner_registry = DispatchTunerRegistry()
dispatch_tuner_registry = DispatchTunerRegistry(check_translation_info=False)
dispatch_tuner_registry.register(
[
ContractionOpInterfaceTuner(),
@@ -658,10 +662,8 @@
)
tune_logger.debug(str(problem_size))

# Index 0 is reserved for default config, so it gets no td spec.
with ir.Location.unknown() as loc:
empty_module = ir.Module.create(loc)
config_specs: list[ir.Module] = [empty_module]
# Index 0 is reserved for default config, so it gets a placeholder spec.
config_specs: list[ir.Module] = [get_placeholder_spec(input_module.context)]

# Get the MMA intrinsic instructions supported by the target.
variant_op_list = iree_codegen.get_executable_variant_ops(input_module)
6 changes: 2 additions & 4 deletions tuner/tuner/dispatch_parser.py
@@ -110,8 +110,7 @@ def get_contraction_operation(
# TODO(Max191): Pass the ir_module directly instead of the template str.
def get_shapes(self, template: list[str]) -> ProblemSize:
matcher = ContractionOpInterfaceMatcher()
with ir.Context() as ctx:
ir_module = ir.Module.parse("\n".join(template), ctx)
ir_module = ir.Module.parse("\n".join(template))
contraction_op = match_root_op(ir_module, matcher)
assert contraction_op is not None, f"contraction op not found"
cdims = matcher.contraction_dimensions
@@ -161,8 +160,7 @@ def get_conv_operation(

# TODO(Max191): Pass the ir_module directly instead of the template str.
def get_shapes(self, template: list[str]) -> ProblemSize:
with ir.Context() as ctx:
ir_module = ir.Module.parse("\n".join(template), ctx)
ir_module = ir.Module.parse("\n".join(template))
conv_op = match_root_op(ir_module, NamedOpMatcher(self.supported_ops))
assert conv_op is not None, f"convolution op not found"
lhs_type = ir.RankedTensorType(conv_op.operands[0].type)