diff --git a/build_tools/ci/cpu_comparison/performance_summarizer.py b/build_tools/ci/cpu_comparison/performance_summarizer.py index 146be4aa7..401b06401 100644 --- a/build_tools/ci/cpu_comparison/performance_summarizer.py +++ b/build_tools/ci/cpu_comparison/performance_summarizer.py @@ -6,15 +6,22 @@ import sys if len(sys.argv) != 2: - print("Usage: python3 performance_summarizer.py ") + print( + "Usage: python3 performance_summarizer.py . This will strip out the performance numbers from the log file and print a summary." + ) sys.exit(1) path = sys.argv[1] with open(path, "r") as f: lines = f.readlines() print("============================") + first_print = True for line in lines: if "Run #1" in line: - print(line.split()[-1]) + if not first_print: + print("\n" + line.split()[-1]) + else: + print(line.split()[-1]) + first_print = False if "IREE_AMDAIE" in line: - print(line) + print(line.strip()) print("============================") diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 755464245..79a73f101 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -25,9 +25,10 @@ ) -def run_conv_test(config, filename, n_repeats): +def run_conv_test(config, aie_compilation_flags, filename, n_repeats): aie_vs_llvm_cpu( config, + aie_compilation_flags, filename, tile_pipeline="conv-decompose", lower_to_aie_pipeline="objectFifo", @@ -37,35 +38,58 @@ def run_conv_test(config, filename, n_repeats): return True -class BaseTemplate(ABC): +class BaseTest(ABC): """ - Base class to be inherited by any new dispatch/compute op template. - The derived instances would therefore be created specifying the intended target - device(s) they're to be run; and would accordingly be `run` if the intended - target device(s) contains the `target_device` supplied via command-line. + Base class to be inherited by all tests. The derived instances will be + created specifying the intended target device(s) they're to be run on; and + will accordingly be `run` if the intended target device(s) contains + the `target_device` found in `config`. The default set of targets to run on + is the singleton set `["npu1_4col"]`. + + An instance of this class has a member `aie_compilation_flags` + which are additional flags to be passed to the AIE backend compiler. + + Compilation flags can therefore be injected into tests in 2 ways: + 1) via the constructor of this base class + 2) via the `add_aie_compilation_flags` method """ - def __init__(self, run_on_target, additional_aie_compilation_flags=None): - self.run_on_target = run_on_target - self.additional_aie_compilation_flags = additional_aie_compilation_flags \ - if additional_aie_compilation_flags is not None else [] + def __init__(self, run_on_target=["npu1_4col"], aie_compilation_flags=None): + self.run_on_target = [] if run_on_target is None else run_on_target + self.aie_compilation_flags = ( + [] if aie_compilation_flags is None else aie_compilation_flags + ) + assert isinstance(self.aie_compilation_flags, list) + assert all(isinstance(flag, str) for flag in self.aie_compilation_flags) + + # NB: derived classes should add labels to this list in their + # constructor, never overwrite it. 
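+        # For example, a hypothetical derived test might inject flags via the
+        # constructor and extend (not overwrite) the labels, along these lines
+        # (illustrative sketch only):
+        #
+        #   class MyMatmulTest(BaseTest):
+        #       def __init__(self):
+        #           super().__init__(
+        #               run_on_target=["npu1_4col"],
+        #               aie_compilation_flags=["--iree-amdaie-enable-ukernels=all"],
+        #           )
+        #           self.labels += ["MyMatmulTest"]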
+ self.labels = ["All"] + + def add_aie_compilation_flags(self, flags): + if flags: + if isinstance(flags, str): + flags = flags.split() + assert isinstance(flags, list) + assert all(isinstance(flag, str) for flag in flags) + + self.aie_compilation_flags += flags + # unique-ify the list + self.aie_compilation_flags = list(set(self.aie_compilation_flags)) def run(self, config): - for flag in self.additional_aie_compilation_flags: - config.add_additional_aie_compilation_flag(flag) if config.target_device in self.run_on_target: return self._execute(config) - # Return False to indicate that the test did not run. return False @abstractmethod def _execute(self, config): - pass + raise RuntimeError("Derived class must implement this method") -class ConvolutionFromTemplate(BaseTemplate): - def __init__(self, params, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class ConvolutionFromTemplate(BaseTest): + def __init__(self, params): + super().__init__() self.generator = ConvolutionMlirGenerator(**params) params = self.generator.params conv_type = params["conv_type"] @@ -75,7 +99,7 @@ def __init__(self, params, run_on_target=["npu1_4col"]): out_type = params["output_element_type"] # TODO(newling) Use all parameters in name, to avoid name collision. self.name = f"{conv_type}_{N}_{IW}_{in_type}_{out_type}" - self.labels = ["Convolution"] + self.labels += ["Convolution"] def _execute(self, config): # Generate MLIR file: @@ -83,73 +107,91 @@ def _execute(self, config): filename = output_dir / f"{self.name}.mlir" self.generator.write_to_file(filename) # Perform numerical comparison between AIE and CPU: - return run_conv_test(config, filename, n_repeats=2) + return run_conv_test(config, self.aie_compilation_flags, filename, n_repeats=2) -class ConvolutionNHWCQ(BaseTemplate): - def __init__(self, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class ConvolutionNHWCQ(BaseTest): + def __init__(self): + super().__init__() self.name = "convolution_nhwc_q" - self.labels = ["Convolution", "ConvolutionNHWCQ"] + self.labels += ["Convolution", "ConvolutionNHWCQ"] def _execute(self, config): files_dir = config.file_dir / "test_files" filename = files_dir / "conv2d_nhwc_q.mlir" - return run_conv_test(config, filename, n_repeats=1) + return run_conv_test(config, self.aie_compilation_flags, filename, n_repeats=1) -class MultipleDispatches(BaseTemplate): - def __init__(self, name, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class MultipleDispatches(BaseTest): + def __init__(self, name): + super().__init__() self.name = name - self.labels = ["Matmul", "MultipleDispatches"] + self.labels += ["Matmul", "MultipleDispatches"] def _execute(self, config): test_files_dir = config.file_dir / "test_files" self.filename = test_files_dir / f"{self.name}.mlir" + # TODO(newling) did Maks ever document why this is here, if so add an + # explainer. if config.xdna_datetime and config.xdna_datetime < 20240801: - aie_vs_llvm_cpu(config, self.filename, function_name="three_$mm$") + aie_vs_llvm_cpu( + config, + self.aie_compilation_flags, + self.filename, + function_name="three_$mm$", + ) return True else: # Return False to indicate that the test did not run. 
return False -class BaseMatmul(BaseTemplate): +class BaseMatmul(BaseTest): def __init__( self, + run_on_target, + aie_compilation_flags, M, N, - K, + K, input_type, acc_type, - run_on_target=["npu1_4col"], - additional_compilation_flags=None + use_ukernel=False, + lower_to_aie_pipeline="objectFifo", + tile_pipeline="pack-peel", + n_repeats=1, ): - super().__init__(run_on_target, additional_compilation_flags) - self.labels = [] + super().__init__(run_on_target, aie_compilation_flags) self.M = M self.N = N self.K = K self.input_type = input_type self.acc_type = acc_type + self.tile_pipeline = tile_pipeline + self.lower_to_aie_pipeline = lower_to_aie_pipeline + self.use_ukernel = use_ukernel + self.n_repeats = n_repeats self.labels.append("Matmul") + if use_ukernel: + self.labels.append("UKernel") + def vs_cpu(self, config, filename): + if self.use_ukernel and not config.vitis_dir: + return False -class MatmulFullBias(BaseMatmul): - """ - A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN - """ + aie_vs_llvm_cpu( + config=config, + aie_compilation_flags=self.aie_compilation_flags, + test_file=filename, + use_ukernel=self.use_ukernel, + tile_pipeline=self.tile_pipeline, + lower_to_aie_pipeline=self.lower_to_aie_pipeline, + n_repeats=self.n_repeats, + ) - def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): - super().__init__(M, N, K, input_type, acc_type, run_on_target) - self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}" - self.labels.append("MatmulFullBias") + return True - def _execute(self, config): - filename = config.output_dir / f"{self.name}.mlir" - matmul_template_dir = config.file_dir / "matmul_template" - template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir" + def generate(self, filename, template_name): generate_matmul_test( filename, template_name, @@ -159,14 +201,33 @@ def _execute(self, config): self.input_type, self.acc_type, ) - aie_vs_llvm_cpu( - config, - filename, - tile_pipeline="pack-peel", - # TODO(someone) This should work for "objectFifo". 
+ + +class MatmulFullBias(BaseMatmul): + """ + A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN + """ + + def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, lower_to_aie_pipeline="air", ) + self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}" + self.labels.append("MatmulFullBias") + def _execute(self, config): + filename = config.output_dir / f"{self.name}.mlir" + matmul_template_dir = config.file_dir / "matmul_template" + template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir" + self.generate(filename, template_name) + self.vs_cpu(config, filename) return True @@ -177,39 +238,47 @@ class VanillaMatmul(BaseMatmul): def __init__( self, - name, M, N, K, input_type, acc_type, - use_ukernel, + name_suffix="", + use_ukernel=False, run_on_target=["npu1_4col"], - additional_labels=[], - additional_aie_compilation_flags=None, + additional_labels=None, + aie_compilation_flags=None, + n_repeats=1, ): - super().__init__(M, N, K, input_type, acc_type, run_on_target, - additional_aie_compilation_flags) - self.name = f"vanilla_matmul_{name}_{M}_{N}_{K}_{input_type}_{acc_type}" + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=aie_compilation_flags, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + use_ukernel=use_ukernel, + n_repeats=n_repeats, + ) + + self.name = f"vanilla_matmul_{M}_{N}_{K}_{input_type}_{acc_type}" + if name_suffix: + self.name += f"_{name_suffix}" + if use_ukernel: + self.name += "_ukernel" self.labels.append("VanillaMatmul") - self.labels += additional_labels + if additional_labels: + self.labels += additional_labels self.use_ukernel = use_ukernel def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_MxK_KxN.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, - self.N, - self.K, - self.input_type, - self.acc_type, - ) - - aie_vs_llvm_cpu(config, self.filename, use_ukernel=self.use_ukernel) + self.generate(self.filename, template_name) + self.vs_cpu(config, self.filename) return True @@ -220,43 +289,38 @@ class MatmulThinBias(BaseMatmul): """ def __init__( - self, M, N, K, input_type, acc_type, use_ukernel, run_on_target=["npu1_4col"] + self, + M, + N, + K, + input_type, + acc_type, + use_ukernel=False, + run_on_target=["npu1_4col"], ): - super().__init__(M, N, K, input_type, acc_type, run_on_target) - tail = "" if use_ukernel else "ukernel" - self.name = f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}_{tail}" - self.labels.append("MatmulThinBias") + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + lower_to_aie_pipeline="air", + use_ukernel=use_ukernel, + ) + + self.name = f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}" if use_ukernel: - self.labels.append("UKernel") - self.use_ukernel = use_ukernel + self.name += "_ukernel" + self.labels.append("MatmulThinBias") def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, 
- self.K, - self.N, - self.input_type, - self.acc_type, - ) - - if self.use_ukernel and not config.vitis_dir: - return False - - else: - aie_vs_llvm_cpu( - config, - self.filename, - tile_pipeline="pack-peel", - # TODO(someone) This should work for "objectFifo". - lower_to_aie_pipeline="air", - use_ukernel=self.use_ukernel, - ) - return True + self.generate(self.filename, template_name) + return self.vs_cpu(config, self.filename) class BatchMatmul(BaseMatmul): @@ -265,7 +329,17 @@ class BatchMatmul(BaseMatmul): """ def __init__(self, B, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): - super().__init__(M, N, K, input_type, acc_type, run_on_target) + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + n_repeats=1, + ) self.name = f"batch_matmul_{B}_{M}_{N}_{K}_{input_type}_{acc_type}" self.labels.append("BatchMatmul") @@ -285,12 +359,7 @@ def _execute(self, config): lhs_rhs_type=self.input_type, acc_type=self.acc_type, ) - aie_vs_llvm_cpu( - config, - self.filename, - ) - - return True + return self.vs_cpu(config, self.filename) class MatmulTruncf(BaseMatmul): @@ -309,56 +378,65 @@ def __init__( expected_out, run_on_target=["npu1_4col"], ): - super().__init__(M, M, K, input_type, acc_type, run_on_target) - self.name = f"matmul_truncf_{M}_{K}_{input_type}_{acc_type}" - self.labels.append("MatmulTruncf") - self.lhs = lhs - self.rhs = rhs - self.expected_out = expected_out + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=M, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + n_repeats=1, + ) # Assertions on shapes: Check that lhs is MxK, rhs is KxM, and expected_out is MxM assert lhs.shape == (M, K) assert rhs.shape == (K, M) assert expected_out.shape == (M, M) + self.name = f"matmul_truncf_{M}_{K}_{input_type}_{acc_type}" + self.labels.append("MatmulTruncf") + self.lhs = lhs + self.rhs = rhs + self.expected_out = expected_out + def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_truncf_MxK_KxN.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, - self.N, - self.K, - self.input_type, - self.acc_type, - ) + self.generate(self.filename, template_name) input_args = generate_inputs( self.filename, config.output_dir, 1, {1: self.lhs, 2: self.rhs} ) """ currently without enabling loop coalescing and unit dimension collapsing - we run out of program memory, this is under investigation. + we run out of program memory, this is under investigation. We also + enable function outlining. 
""" + self.add_aie_compilation_flags( + [ + "--iree-amdaie-enable-coalescing-loops", + "--iree-amdaie-enable-collapsing-unit-dims", + "--iree-amdaie-enable-function-outlining", + ] + ) aie_vs_baseline( - config, - self.filename, - input_args, - self.expected_out, - use_ukernel=False, - tile_pipeline="pack-peel", - lower_to_aie_pipeline="objectFifo", + config=config, + aie_compilation_flags=self.aie_compilation_flags, + test_file=self.filename, + input_args=input_args, + baseline_value=self.expected_out, + use_ukernel=self.use_ukernel, + tile_pipeline=self.tile_pipeline, function_name=None, seed=1, rtol=0, atol=0, - n_repeats=1, + lower_to_aie_pipeline=self.lower_to_aie_pipeline, + n_repeats=self.n_repeats, output_type=get_output_type(self.filename), - coalesce_loops=True, - collapse_unit_dims=True, - function_outline=True, ) return True @@ -439,6 +517,7 @@ def shell_out(cmd: list, workdir=None, verbose: int = 0, raise_on_error=True, en def generate_aie_vmfb( config, + aie_compilation_flags, name, tile_pipeline, lower_to_aie_pipeline, @@ -446,18 +525,15 @@ def generate_aie_vmfb( test_file, input_args, function_name, - coalesce_loops=False, - collapse_unit_dims=False, - function_outline=False, ): """ Compile a test file for IREE's AIE backend, returning the path to the compiled module. """ - additional_flags = config.additional_aie_compilation_flags.split() + additional_flags = aie_compilation_flags - compilation_flags = [ + aie_compilation_flags = [ config.iree_compile_exe, test_file, "--iree-hal-target-backends=amd-aie", @@ -479,31 +555,22 @@ def generate_aie_vmfb( ] if config.verbose: - compilation_flags += ["--iree-amd-aie-show-invoked-commands"] + aie_compilation_flags += ["--iree-amd-aie-show-invoked-commands"] if use_ukernel: - compilation_flags += ["--iree-amdaie-enable-ukernels=all"] - - if coalesce_loops: - compilation_flags += ["--iree-amdaie-enable-coalescing-loops"] - - if collapse_unit_dims: - compilation_flags += ["--iree-amdaie-enable-collapsing-unit-dims"] - - if function_outline: - compilation_flags += ["--iree-amdaie-enable-function-outlining"] + aie_compilation_flags += ["--iree-amdaie-enable-ukernels=all"] for additional_flag in additional_flags: - if additional_flag not in compilation_flags: - compilation_flags += [additional_flag] + if additional_flag not in aie_compilation_flags: + aie_compilation_flags += [additional_flag] - compilation_flags += [ + aie_compilation_flags += [ "-o", config.output_dir / f"{name}_aie.vmfb", ] start = time.monotonic_ns() - shell_out(compilation_flags, config.output_dir, config.verbose) + shell_out(aie_compilation_flags, config.output_dir, config.verbose) compile_time = time.monotonic_ns() - start if config.verbose: print(f"Time spent in compilation: {compile_time // 1e6} [ms]") @@ -562,7 +629,7 @@ def generate_llvm_cpu_output( """ cpu_vmfb = config.output_dir / f"{name}_cpu.vmfb" - compilation_flags = [ + aie_compilation_flags = [ config.iree_compile_exe, test_file, "--iree-hal-target-backends=llvm-cpu", @@ -570,7 +637,7 @@ def generate_llvm_cpu_output( "-o", f"{cpu_vmfb}", ] - shell_out(compilation_flags, workdir=config.output_dir, verbose=config.verbose) + shell_out(aie_compilation_flags, workdir=config.output_dir, verbose=config.verbose) cpu_bin = config.output_dir / f"{name}_cpu.bin" run_args = [ @@ -587,8 +654,7 @@ def generate_llvm_cpu_output( class TestConfig: """ - Global state used for all tests. Stores paths to executables used, and - records test failures. + Global state used for all tests. Stores paths to executables used. 
""" def __init__( @@ -602,10 +668,8 @@ def __init__( iree_compile_exe, iree_run_exe, verbose, - return_on_fail, reset_npu_between_runs, do_not_run_aie, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -620,13 +684,11 @@ def __init__( self.file_dir = file_dir self.iree_compile_exe = iree_compile_exe self.iree_run_exe = iree_run_exe - self.return_on_fail = return_on_fail self.verbose = verbose self.xdna_datetime = None self.xdna_hash = None self.reset_npu_between_runs = reset_npu_between_runs self.do_not_run_aie = do_not_run_aie - self.additional_aie_compilation_flags = additional_aie_compilation_flags self.device_hal = device_hal self.xrt_lite_n_core_rows = xrt_lite_n_core_rows self.xrt_lite_n_core_cols = xrt_lite_n_core_cols @@ -647,9 +709,6 @@ def __init__( f"The file {self.reset_npu_script} does not exist, and reset_npu_script=True" ) - # Populated at runtime - self.failures = [] - if not isinstance(self.verbose, bool) and not isinstance(self.verbose, int): raise ValueError( f"verbose must be a boolean or integer, not {type(verbose)}" @@ -727,9 +786,6 @@ def __init__( if peano_commit_hash: self.peano_commit_hash = peano_commit_hash[0] - def add_additional_aie_compilation_flag(self, flag): - self.additional_aie_compilation_flags += flag - def __str__(self): return dedent( f""" @@ -746,7 +802,6 @@ def __str__(self): peano_commit_hash: {self.peano_commit_hash} peano_dir: {self.peano_dir} reset_npu_script: {self.reset_npu_script} - return_on_fail: {self.return_on_fail} use_chess: {self.use_chess} verbose: {self.verbose} vitis_dir: {self.vitis_dir} @@ -798,6 +853,7 @@ def name_from_mlir_filename(mlir_filename): def aie_vs_baseline( config, + aie_compilation_flags, test_file, input_args, baseline_value, @@ -810,13 +866,10 @@ def aie_vs_baseline( atol, n_repeats, output_type, - coalesce_loops=False, collapse_unit_dims=False, function_outline=False, ): """ - If the outputs differ, add the test file to a list of failures. - Arguments to the function are: config: TestConfig containing any state which is common to all tests @@ -839,8 +892,6 @@ def aie_vs_baseline( n_repeats: The number of times to run the test. This is useful for tests which may pass only sometimes due to driver issues, etc. 
- coalesce_loops: - Whether to enable coalescing of loops when compiling for AIE backend collapse_unit_dims: Whether to enable collapsing of unit dimensions when compiling for AIE backend function_outline: @@ -851,6 +902,7 @@ def aie_vs_baseline( aie_vmfb = generate_aie_vmfb( config, + aie_compilation_flags, name, tile_pipeline, lower_to_aie_pipeline, @@ -858,9 +910,6 @@ def aie_vs_baseline( test_file, input_args, function_name, - coalesce_loops, - collapse_unit_dims, - function_outline, ) if config.do_not_run_aie: @@ -884,13 +933,12 @@ def aie_vs_baseline( summary_string = compare(baseline_value, aie_output, rtol, atol) if summary_string: print(summary_string) - config.failures.append(test_file) - if config.return_on_fail: - raise RuntimeError("Test failed, exiting.") + raise RuntimeError("Test failed, exiting.") def aie_vs_llvm_cpu( config, + aie_compilation_flags, test_file, use_ukernel=False, tile_pipeline="pack-peel", @@ -927,6 +975,7 @@ def aie_vs_llvm_cpu( aie_vs_baseline( config, + aie_compilation_flags, test_file, input_args, cpu_output, @@ -944,6 +993,10 @@ def aie_vs_llvm_cpu( class Tests: + def add_aie_compilation_flags(self, flags): + for test in self.tests: + test.add_aie_compilation_flags(flags) + def register(self, test): self.tests.append(test) if test.name in self.existing_names: @@ -1004,58 +1057,71 @@ def __init__(self): self.register(BatchMatmul(2, 64, 64, 64, input_type, acc_type)) # MatmulThinBias test(s): - self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", True)) - self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", False)) + self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", use_ukernel=True)) + self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32")) # VanillaMatmul test(s): self.register( VanillaMatmul( - "scalar_i32", 32, 32, 32, "i32", "i32", - use_ukernel=False, run_on_target=["npu1_4col", "npu4"], ) ) self.register( VanillaMatmul( - "infinite_loop", 32, 32, 32, "i32", "i32", - use_ukernel=False, + name_suffix="infinite_loop", run_on_target=["npu1_4col", "npu4"], - additional_aie_compilation_flags=["--iree-amdaie-enable-infinite-loop-around-core-block=true"] + aie_compilation_flags=[ + "--iree-amdaie-enable-infinite-loop-around-core-block=true" + ], ) ) - self.register(VanillaMatmul("bfloat", 32, 32, 64, "bf16", "f32", use_ukernel=False)) - + self.register(VanillaMatmul(32, 32, 64, "bf16", "f32")) # TODO: Failure is expected for the 128x128 case we don't yet understand why. 
self.register( VanillaMatmul( - "bfloat_ukernel", 64, 64, 64, "bf16", "f32", use_ukernel=True, run_on_target=["npu4"] - ) - ) - - self.register( - VanillaMatmul( - "bfloat_perf", - 512, - 512, - 4096, + 64, + 64, + 64, "bf16", "f32", - use_ukernel=False, - additional_labels=["Performance"], + use_ukernel=True, + run_on_target=["npu4"], ) ) + # Some bf16 Performance tests: + for M, N, K, use_ukernel in [ + (512, 512, 4096, False), + (512, 512, 4096, True), + (512, 4096, 512, False), + (512, 4096, 512, True), + (4096, 512, 512, False), + (4096, 512, 512, True), + ]: + self.register( + VanillaMatmul( + M, + N, + K, + "bf16", + "f32", + additional_labels=["Performance"], + use_ukernel=use_ukernel, + n_repeats=2, + ) + ) + # MultipleDispatches tests: for name in ["two_matmul_switching", "matmul_f32_8_8_4", "matmul_f32_8_4_8"]: self.register(MultipleDispatches(name)) @@ -1102,12 +1168,10 @@ def all_tests( peano_dir, xrt_dir, vitis_dir, - return_on_fail, verbose, reset_npu_between_runs, do_not_run_aie, test_set, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -1138,7 +1202,7 @@ def all_tests( raise RuntimeError(f"'{iree_install_dir}' is not a directory.") iree_compile_exe = find_executable(iree_install_dir, "iree-compile") iree_run_exe = find_executable(iree_install_dir, "iree-run-module") - file_dir = Path(__file__).parent + file_dir = Path(os.path.dirname(os.path.abspath(__file__))) config = TestConfig( output_dir, @@ -1150,10 +1214,8 @@ def all_tests( iree_compile_exe, iree_run_exe, verbose, - return_on_fail, reset_npu_between_runs, do_not_run_aie, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -1180,8 +1242,8 @@ def all_tests( for test in tests.tests: # Determine if the test is a match for the test_set provided by caller - match = "All" in test_set - match = match or test.name in test_set + # match = "All" in test_set + match = test.name in test_set for label in test.labels: match = match or label in test_set @@ -1194,20 +1256,6 @@ def all_tests( else: not_match.append(test.name) - if config.failures: - # Convert the list of failed tests into a map: test name to the - # number of failures (config.failures list may contain duplicates) - failures_map = {} - for test in config.failures: - if test in failures_map: - failures_map[test] += 1 - else: - failures_map[test] = 1 - error_string = "The following tests failed:" - for test, count in failures_map.items(): - error_string += f"\n {test} ({count} times)." - raise RuntimeError(error_string) - if verbose: print(f"Tests that ran: {match_run}") print(f"Tests that matched but did not run: {match_not_run}") @@ -1254,20 +1302,6 @@ def all_tests( "--target_device", type=str, required=True, help=target_device_help_string ) - # TODO(newling) make bool options boolean, not integer (tried but had issues) - parser.add_argument( - "--return-on-fail", - nargs="?", - default=1, - type=int, - help=dedent( - """ - If 0, then the script will continue running even if a test fails, - enumerating all failures. Otherwise the script will exit on the first failure. - """ - ), - ) - parser.add_argument( "-v", "--verbose", @@ -1306,6 +1340,20 @@ def all_tests( ), ) + parser.add_argument( + "--aie-compilation-flags", + type=str, + help=dedent( + """ + Additional flags to pass to the AIE compiler, for all tests. 
+ Example, to print the IR between passes during compilation you might have: + --aie_compilation_flags="--mlir-print-ir-before-all --mlir-print-ir-module-scope + --aie2xclbin-print-ir-before-all --aie2xclbin-print-ir-module-scope"' + """ + ), + default="", + ) + tests = Tests() labels = tests.get_label_set() labels.append("All") @@ -1326,20 +1374,6 @@ def all_tests( default="All", ) - parser.add_argument( - "--additional-aie-compilation-flags", - type=str, - help=dedent( - """ - Additional flags to pass to the AIE compiler, for all tests. - Example, do print the IR between passes during compilation you might have: - --additional-aie-compilation-flags="--mlir-print-ir-before-all --mlir-print-ir-module-scope - --aie2xclbin-print-ir-before-all --aie2xclbin-print-ir-module-scope" - """ - ), - default="", - ) - parser.add_argument( "--device-hal", default="xrt-lite", @@ -1365,6 +1399,7 @@ def all_tests( raise ValueError( f"Invalid target device '{args.target_device}'. Available options: {current_devices}" ) + tests.add_aie_compilation_flags(args.aie_compilation_flags) all_tests( tests, @@ -1373,12 +1408,10 @@ def all_tests( args.peano_install_dir, args.xrt_dir, args.vitis_dir, - args.return_on_fail, args.verbose, args.reset_npu_between_runs, args.do_not_run_aie, test_set_list, - args.additional_aie_compilation_flags, args.device_hal, args.xrt_lite_n_core_rows, args.xrt_lite_n_core_cols, diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td index 729295535..9b8691810 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td +++ b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td @@ -24,7 +24,7 @@ class AIE_Op traits = []> : def AIE_DeviceOp: AIE_Op<"device", [ HasParent<"mlir::ModuleOp">, - SymbolTable, SingleBlock, NoTerminator, IsolatedFromAbove + SymbolTable, SingleBlockImplicitTerminator<"EndOp">, IsolatedFromAbove ]> { let summary = "Define an AIE design targetting a complete device"; let arguments = (ins AMDAIEDeviceAttr:$device); diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEX.td b/compiler/plugins/target/AMD-AIE/aie/AIEX.td index b52b1dc57..783f5c001 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEX.td +++ b/compiler/plugins/target/AMD-AIE/aie/AIEX.td @@ -57,7 +57,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ OptionalAttr:$packet, FlatSymbolRefAttr:$metadata, I64Attr:$id, - DefaultValuedOptionalAttr:$issue_token + DefaultValuedOptionalAttr:$issue_token, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let assemblyFormat = [{ @@ -186,6 +192,7 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd"> { I32Attr:$d0_stride, I32Attr:$d1_size, I32Attr:$d1_stride, + I32Attr:$d2_size, I32Attr:$d2_stride, I32Attr:$iteration_current, I32Attr:$iteration_size, @@ -198,7 +205,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd"> { I32Attr:$lock_rel_id, I32Attr:$lock_acq_enable, I32Attr:$lock_acq_val, - I32Attr:$lock_acq_id + I32Attr:$lock_acq_id, + I32Attr:$d0_zero_before, + I32Attr:$d1_zero_before, + I32Attr:$d2_zero_before, + I32Attr:$d0_zero_after, + I32Attr:$d1_zero_after, + I32Attr:$d2_zero_after ); let results = (outs ); let assemblyFormat = [{ attr-dict }]; diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp b/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp index b604b7e30..0114adcee 100644 --- 
a/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp @@ -30,6 +30,10 @@ void AIEXDialect::initialize() { #define GET_OP_CLASSES #include "aie/AIEX.cpp.inc" +//===----------------------------------------------------------------------===// +// NpuDmaMemcpyNdOp +//===----------------------------------------------------------------------===// + llvm::SmallVector AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() { MemRefType buffer = getMemref().getType(); diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp index 2e790a7bf..171898f83 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp @@ -194,7 +194,8 @@ LogicalResult runOnPacketFlow( Port destPort = {(pktDest.getBundle()), pktDest.getChannel()}; TileLoc destCoord = {destTile.getCol(), destTile.getRow()}; if (pktFlowOp->hasAttr("keep_pkt_header")) - keepPktHeaderAttr[PhysPort{destCoord, destPort}] = + keepPktHeaderAttr[PhysPort{destCoord, destPort, + PhysPort::Direction::DST}] = StringAttr::get(Op.getContext(), "true"); assert(srcPort.bundle != StrmSwPortType::SS_PORT_TYPE_MAX && srcPort.channel != -1 && "expected srcPort to have been set"); @@ -243,8 +244,10 @@ LogicalResult runOnPacketFlow( SmallVector slavePorts; for (const auto &[tileId, connects] : switchboxes) { for (const auto &[conn, flowID] : connects) { - PhysPortAndID sourceFlow = {PhysPort{tileId, conn.src}, flowID}; - packetFlows[sourceFlow].insert({PhysPort{tileId, conn.dst}, flowID}); + PhysPortAndID sourceFlow = { + PhysPort{tileId, conn.src, PhysPort::Direction::SRC}, flowID}; + packetFlows[sourceFlow].insert( + {PhysPort{tileId, conn.dst, PhysPort::Direction::DST}, flowID}); slavePorts.push_back(sourceFlow); } } @@ -307,7 +310,7 @@ LogicalResult runOnPacketFlow( std::sort(tileMasters.begin(), tileMasters.end()); for (Port tileMaster : tileMasters) { std::vector> amsels = - masterSets[{tileLoc, tileMaster}]; + masterSets[{tileLoc, tileMaster, PhysPort::Direction::DST}]; std::vector amselVals; for (std::pair amsel : amsels) { assert(amselOps.count(amsel) == 1 && "expected amsel in amselOps"); @@ -316,7 +319,8 @@ LogicalResult runOnPacketFlow( auto msOp = builder.create( builder.getUnknownLoc(), builder.getIndexType(), (tileMaster.bundle), tileMaster.channel, amselVals); - if (auto pktFlowAttrs = keepPktHeaderAttr[{tileLoc, tileMaster}]) + if (auto pktFlowAttrs = keepPktHeaderAttr[{tileLoc, tileMaster, + PhysPort::Direction::DST}]) msOp->setAttr("keep_pkt_header", pktFlowAttrs); } diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp index 7d8df1b1d..06bdea543 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp @@ -301,6 +301,7 @@ struct DmaToNpuPattern : OpConversionPattern { auto d0_stride = zero; auto d1_size = zero; auto d1_stride = zero; + auto d2_size = zero; auto d2_stride = zero; auto iteration_current = zero; auto iteration_size = zero; @@ -314,6 +315,12 @@ struct DmaToNpuPattern : OpConversionPattern { auto lock_acq_enable = zero; auto lock_acq_val = zero; auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; auto 
issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; @@ -361,6 +368,9 @@ struct DmaToNpuPattern : OpConversionPattern { // d1_stride if (strides[1]) d1_stride = IntegerAttr::get(i32ty, strides[1] - 1); + // d2_size + if (strides[3]) d2_size = IntegerAttr::get(i32ty, sizes[2]); + // d2_stride if (strides[2]) d2_stride = IntegerAttr::get(i32ty, strides[2] - 1); @@ -389,12 +399,32 @@ struct DmaToNpuPattern : OpConversionPattern { // This logic is kept for now for backward compatibility. if (!isMM2S) issue_token = BoolAttr::get(ctx, true); + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, - d0_stride, d1_size, d1_stride, d2_stride, iteration_current, + d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, - lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after); AMDAIEDeviceModel tm = getDeviceModel(static_cast(dev.getDevice())); diff --git a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt index 3f14932d9..8dfe5a24c 100644 --- a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt @@ -203,6 +203,27 @@ replace_string_in_file( ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp "memtileToSizeMap[t] = m.getTargetModel().getMemTileSize()" "memtileToSizeMap[t] = m.getTargetModel().getMemTileSize(t.getCol(), t.getRow())") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "targetModel.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "device.getTargetModel().hasProperty(AIE::AIETargetModel::IsNPU)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "AIE::getTargetModel(*device)" + "getDeviceModel(*device)") + +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIESchedulingUtils.cpp + "targetModel.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIESchedulingUtils.cpp + "target_model.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") replace_string_in_file( ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRRtToNpuPass.cpp diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir index 8e58250c0..44c6eba7c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir @@ -43,6 +43,7 @@ module 
{ d1_stride = 7 : i32, d1_size = 8 : i32, d2_stride = 9 : i32, + d2_size = 14 : i32, ddr_id = 10 : i32, iteration_current = 11 : i32, iteration_stride = 12 : i32, @@ -54,7 +55,14 @@ module { lock_rel_val = 4 : i32, next_bd = 5 : i32, use_next_bd = 1 : i32, - valid_bd = 1 : i32} + valid_bd = 1 : i32, + d0_zero_before = 0 : i32, + d1_zero_before = 1 : i32, + d2_zero_before = 2 : i32, + d0_zero_after = 3 : i32, + d1_zero_after = 4 : i32, + d2_zero_after = 5 : i32 + } // CHECK: 00000000 // CHECK: 00000000 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir index 6130aa54b..e0eacc703 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir @@ -13,7 +13,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 429089cdd..7b0b58026 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. 
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s @@ -20,7 +20,7 @@ // CHECK: aie.use_lock // Check a bit of the aiex.runtime_sequence: // CHECK: aiex.runtime_sequence @matmul_i32() -// CHECK: } {npu_instructions = dense_resource : tensor<174xui32>, runtime_sequence_name = "matmul_i32"} +// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "matmul_i32"} #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir index 9229da0c3..b69322068 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir index 326a178e5..210b1ce99 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir @@ -15,7 +15,7 @@ // PHOENIX-DAG: aie.core(%[[TILE_0_3]]) // PHOENIX-DAG: aie.core(%[[TILE_1_3]]) // PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // PHOENIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_2]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_3]]) @@ -39,7 +39,7 @@ // STRIX-DAG: aie.core(%[[TILE_0_3]]) // STRIX-DAG: aie.core(%[[TILE_1_3]]) // STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // STRIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // STRIX-DAG: aie.mem(%[[TILE_0_2]]) // STRIX-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp index 05f004545..1189306b6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp @@ -4,10 +4,10 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Iterators.h" #define DEBUG_TYPE "iree-amdaie-access-to-acquire-release" @@ -15,86 +15,111 @@ namespace mlir::iree_compiler::AMDAIE { namespace { +/// Some blocks have terminator ops, which must appear as the very last op in +/// the block. If `block` has a terminator, set the insertion point of +/// `rewriter` to just before the terminator, ready to create a new penultimate +/// op in the block. Otherwise, set the insertion point to the very end of the +/// block. +void setInsertionToEnd(IRRewriter &rewriter, Block *block) { + if (block->back().hasTrait()) { + rewriter.setInsertionPoint(block->getTerminator()); + } else { + rewriter.setInsertionPointToEnd(block); + } +} + +llvm::MapVector> +getFifosToAccesses(AMDAIE::CoreOp coreOp, AMDAIE::MemoryAccess type) { + llvm::MapVector> + accesses; + coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getAccessType() != type) return WalkResult::advance(); + Value input = accessOp.getInput(); + auto iter = accesses.find(input); + if (iter == accesses.end()) { + accesses.insert({input, {accessOp}}); + } else { + iter->second.push_back(accessOp); + } + return WalkResult::advance(); + }); + return accesses; +} + /// Walk all read access operations within the core operations and insert /// semaphore acquire and release stubs. Acquire operations will be inserted -/// at the location of the access operation and release operations will be -/// inserted before the next access or at the end of the block. +/// at the location of the access operation, and release operations will be +/// inserted some time before the next read access. 
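+/// For example, two reads of the same logical objectfifo,
+///
+///   %0 = access(%fifo, Read)
+///   ...
+///   %1 = access(%fifo, Read)
+///
+/// become, roughly (illustrative pseudo-IR, not exact dialect syntax):
+///
+///   %a0 = acquire(%connection, Consume)
+///   %0  = access(%a0, Read)
+///   ...
+///   release(%connection, Consume)
+///   %a1 = acquire(%connection, Consume)
+///   %1  = access(%a1, Read)
+///   ...
+///   release(%connection, Consume)   // for the last read: at the end of the block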
LogicalResult readAccessToAcquireRelease(Operation *parentOp) { + AMDAIE::MemoryAccess accessType = AMDAIE::MemoryAccess::Read; + AMDAIE::LogicalObjectFifoPort port = LogicalObjectFifoPort::Consume; + IRRewriter rewriter(parentOp->getContext()); SmallVector coreOps; parentOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); - // Map from DMA source/target logical objectFifos to those respective DMA - // operations. - DenseMap logicalObjectFifoToDma; + // Map from the source and target amdaie.logicalobjectfifo values of + // amdaie.connections to the amdaie.connections themselves. + DenseMap logicalObjectFifoToConnection; parentOp->walk([&](AMDAIE::ConnectionOp dmaOp) { - logicalObjectFifoToDma[dmaOp.getSource()] = dmaOp; - logicalObjectFifoToDma[dmaOp.getTarget()] = dmaOp; + logicalObjectFifoToConnection.insert({dmaOp.getSource(), dmaOp}); + logicalObjectFifoToConnection.insert({dmaOp.getTarget(), dmaOp}); }); for (AMDAIE::CoreOp coreOp : coreOps) { - llvm::MapVector - logicalObjectFifoToLastAccess; - WalkResult res = - coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { - if (accessOp.getAccessType() != AMDAIE::MemoryAccess::Read) - return WalkResult::advance(); - - if (logicalObjectFifoToLastAccess.contains(accessOp.getInput())) { - rewriter.setInsertionPoint(accessOp); - rewriter.create( - rewriter.getUnknownLoc(), - logicalObjectFifoToDma[accessOp.getInput()].getResult(), - LogicalObjectFifoPort::Consume); - } + auto fifosToAccesses = getFifosToAccesses(coreOp, accessType); - if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { - accessOp.emitOpError() - << "read access not found as source of DMA operation"; - return WalkResult::interrupt(); - } - rewriter.setInsertionPoint(accessOp); - auto acquireOp = rewriter.create( - rewriter.getUnknownLoc(), - llvm::cast(accessOp.getInput().getType()), - logicalObjectFifoToDma[accessOp.getInput()].getResult(), - LogicalObjectFifoPort::Consume); - auto newAccessOp = rewriter.create( - rewriter.getUnknownLoc(), acquireOp.getResult(), - AMDAIE::MemoryAccess::Read); - rewriter.replaceAllUsesWith(accessOp.getResult(), - newAccessOp.getResult()); - logicalObjectFifoToLastAccess[accessOp.getInput()] = accessOp; - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); + for (auto &&[logicalObjectFifo, accessOps] : fifosToAccesses) { + for (uint64_t i = 0; i < accessOps.size(); ++i) { + AMDAIE::LogicalObjectFifoAccessOp accessOp = accessOps[i]; - // Insert release for remaining read access operations at end of block. - for (auto &&[value, accessOp] : logicalObjectFifoToLastAccess) { - Block *parentBlock = accessOp->getBlock(); - if (!parentBlock->back().hasTrait()) { - rewriter.setInsertionPointToEnd(parentBlock); - } else { - rewriter.setInsertionPoint(parentBlock->getTerminator()); - } - if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { - accessOp.emitOpError() - << "read access not found as source of DMA operation"; - return failure(); + Value input = accessOp.getInput(); + if (!logicalObjectFifoToConnection.contains(input)) { + return accessOp.emitOpError() + << "does not have a connection in the logicalobjectfifo map"; + } + + // Insert the access op. 
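+          // An acquire op is created at the location of the original access,
+          // and the access is recreated on the acquired value.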
+ rewriter.setInsertionPoint(accessOp); + Block *block = accessOp->getBlock(); + auto acquireOp = rewriter.create( + rewriter.getUnknownLoc(), + llvm::cast(input.getType()), + logicalObjectFifoToConnection[input].getResult(), port); + auto newAccessOp = rewriter.create( + rewriter.getUnknownLoc(), acquireOp.getResult(), accessType); + rewriter.replaceAllUsesWith(accessOp.getResult(), + newAccessOp.getResult()); + + // Insert the release op. The location of the release is as close to the + // following access op as possible, but always in the same block as the + // access op being released. + AMDAIE::LogicalObjectFifoAccessOp nextAccessOp; + if (i + 1 != accessOps.size()) nextAccessOp = accessOps[i + 1]; + Operation *nextAccessOpsAncestor = + getAncestorInBlock(nextAccessOp, block); + if (nextAccessOpsAncestor && + nextAccessOpsAncestor->getBlock() == block) { + rewriter.setInsertionPoint(nextAccessOpsAncestor); + } else { + setInsertionToEnd(rewriter, block); + } + rewriter.create( + rewriter.getUnknownLoc(), + logicalObjectFifoToConnection[input].getResult(), port); } - rewriter.create( - rewriter.getUnknownLoc(), logicalObjectFifoToDma[accessOp.getInput()], - LogicalObjectFifoPort::Consume); } } return success(); } /// Walk all write access operations within the core operations and insert -/// semaphore operations. Release operations will be inserted -/// at the location of the access operation and acquire operations will be -/// inserted after the preceding access or at the beginning of the block. +/// semaphore operations. Release operations will be inserted at the location of +/// the access operation and acquire operations will be inserted after the +/// preceding access or at the beginning of the block. TODO(newling): update +/// this to ensure that corresponding accesses and releases are in the same +/// block, as in the case of `readAccessToAcquireRelease`. 
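+/// Sketched on pseudo-IR (illustrative only, assuming the Produce port is the
+/// write-side analogue of Consume), a single write
+///
+///   access(%fifo, Write)
+///
+/// becomes
+///
+///   %a = acquire(%connection, Produce)   // after the preceding access / at block start
+///   ...
+///   access(%a, Write)
+///   release(%connection, Produce)        // at the location of the write access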
LogicalResult writeAccessToAcquireRelease(Operation *parentOp) { IRRewriter rewriter(parentOp->getContext()); @@ -214,7 +239,7 @@ class AMDAIEAccessToAcquireReleasePass AMDAIEAccessToAcquireReleasePass() = default; AMDAIEAccessToAcquireReleasePass( - const AMDAIEAccessToAcquireReleasePass &pass) {}; + const AMDAIEAccessToAcquireReleasePass &pass){}; void runOnOperation() override; }; @@ -225,6 +250,7 @@ void AMDAIEAccessToAcquireReleasePass::runOnOperation() { "acquire-release semaphore stubs"; return signalPassFailure(); } + if (failed(writeAccessToAcquireRelease(parentOp))) { parentOp->emitOpError() << "failed to convert write access operations to " "acquire-release semaphore stubs"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp index cb5e66424..cbbc2cf8b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp @@ -9,9 +9,7 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/AMDAIEOpUtils.h" #include "iree-amd-aie/Transforms/Passes.h" -#include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/Support/MathExtras.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/SCF/Utils/Utils.h" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp new file mode 100644 index 000000000..7b4acc0b1 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp @@ -0,0 +1,429 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelGenerator.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-assign-tiles" + +namespace mlir::iree_compiler::AMDAIE { + +/// Return the tiles of the sources respectively targets of the users of this +/// logical objectfifo, depending on whether the OperateOn template parameter is +/// set to `OperateOn::Source` respectively `OperateOn::Target`. +template +LogicalResult getUserTiles( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + SmallVectorImpl &tiles) { + llvm::SmallSetVector tileSet; + for (Operation *user : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(user)) { + ValueRange tileIndices; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getSourceObjectFifo().getTiles(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getTargetObjectFifo().getTiles(); + } + // Only fill in tiles when all sources have tiles. 
+ if (tileIndices.empty()) return failure(); + for (Value index : tileIndices) { + tileSet.insert( + dyn_cast_if_present(index.getDefiningOp())); + } + } + } + tiles = tileSet.takeVector(); + return success(); +} + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. +LogicalResult findUsersInCoreAndAddTiles( + Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + llvm::SmallSetVector, 16> &tiles) { + for (Operation *userOp : op->getUsers()) { + if (auto coreOp = userOp->getParentOfType()) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) + return coreOp.emitOpError() << "has non-constant tile location"; + tiles.insert(std::make_pair(column.value(), row.value())); + } + if (auto subviewOp = dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); + } else if (auto userLogicalObjectFifo = + dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(userLogicalObjectFifo, + logicalObjectFifo, tiles); + } + } + return success(); +} + +/// Utility to clear non-local tile assignments. +LogicalResult clearNonLocalTiles(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp objFifo) { + if (objFifo.getMemorySpaceAsUInt() != 2) { + rewriter.setInsertionPoint(objFifo); + SmallVector tiles; + rewriter.replaceOpWithNewOp( + objFifo, cast(objFifo.getOutput().getType()), + objFifo.getMemref(), tiles); + } + }); + return success(); +} + +/// Utility to duplicate global objectFifos (L3) for each strided copy-like +/// operation user to allow global logical objectFifos to be assigned to +/// different tile locations. +LogicalResult duplicateGlobalObjFifos(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::DoublyStridedCopyOpInterface copyOp) { + auto source = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto target = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + auto createNewObjFifoAndReplaceUsesFrom = + [&](AMDAIE::LogicalObjectFifoFromMemrefOp oldObjFifo) { + rewriter.setInsertionPoint(copyOp); + auto newObjFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast(oldObjFifo.getOutput().getType()), + oldObjFifo.getMemref()); + rewriter.replaceUsesWithIf( + oldObjFifo.getOutput(), newObjFifo.getOutput(), + [&](OpOperand &use) { + return use.getOwner() == copyOp.getOperation(); + }); + }; + if (source && source.getMemorySpaceAsUInt() == 0) { + createNewObjFifoAndReplaceUsesFrom(source); + } + if (target && target.getMemorySpaceAsUInt() == 0) { + createNewObjFifoAndReplaceUsesFrom(target); + } + }); + return success(); +} + +/// Assign tiles to the logical objectfifos with local memory space (L1). +/// The tiles are derived from the usage of the logical objectfifos within +/// core operations, which are already assigned a tile location. 
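+/// For example, a logical objectfifo that is only accessed from within an
+/// `amdaie.core` op placed on tile (col = 0, row = 2) is assigned tile (0, 2).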
+LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op) { + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles( + logicalObjectFifo, logicalObjectFifo, tileLocations))) { + return WalkResult::interrupt(); + } + // Handle subviews. + for (Operation *userOp : + logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + if (auto subviewOp = dyn_cast(userOp)) { + if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, + tileLocations))) { + return WalkResult::interrupt(); + } + } + } + + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(logicalObjectFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tiles); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +/// Assign a set of candidate physical AIE tiles to logical objectFifos. This +/// rewrite takes an iterative approach by matching logical objectfifos and only +/// assigning tiles when linked through dma ops with other logical objectfifos +/// which already have tiles assigned. If the linked logical objectfifos don't +/// have tiles assigned yet, we will return a failure and give the linked +/// logical objectfifos a chance to assign tiles before returning to this one. 
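+/// For example, if the consumers of an L2 objectFifo are L1 objectFifos that
+/// already have tiles (0, 2) and (1, 2) assigned, the candidate locations for
+/// the L2 objectFifo become columns 0 and 1 on the row reported for its
+/// memory space (the mem tile row).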
+class FillTiles + : public OpRewritePattern { + using OpRewritePattern< + AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; + + public: + FillTiles(MLIRContext *context, const AMDAIE::AMDAIEDeviceModel &deviceModel) + : OpRewritePattern(context), deviceModel(deviceModel) {} + + LogicalResult matchAndRewrite( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "FillTiles: " << logicalObjectFifo << "\n"); + if (!logicalObjectFifo.getTiles().empty()) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "Tiles are already assigned."); + } + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Skip logical objFifos that don't operate on L3 or L2"); + } + + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(logicalObjectFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(logicalObjectFifo, sourceTiles); + if (failed(dstRes) && failed(srcRes)) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "No source or target tiles found"); + } + + SmallVector memSpaceRows = deviceModel.getMemSpaceRows(memSpace); + if (memSpaceRows.size() == 0) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No rows found for the memory space of this logical objFifo"); + } + if (memSpaceRows.size() > 1) { + logicalObjectFifo.emitWarning() + << "has a memory space with multiple available rows, the first one " + "of which is chosen for tile assignment, but this might not lead " + "to good usage of the available resources."; + } + uint32_t row = memSpaceRows[0]; + llvm::SmallSetVector, 16> tileLocations; + auto createTileLocations = + [&](SmallVector &tiles) -> LogicalResult { + // For deterministic and canonical output, sort on column index and erase + // duplicates. + std::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + for (AMDAIE::TileOp tile : tiles) { + std::optional column = getConstantIntValue(tile.getCol()); + if (!column) + return rewriter.notifyMatchFailure(tile, "found non-constant column"); + tileLocations.insert(std::make_pair(column.value(), row)); + } + return success(); + }; + + if (!targetTiles.empty() && !sourceTiles.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Found logical objectfifo with both source and target tiles, which " + "is not supported yet"); + } else if (!targetTiles.empty()) { + // Create tile locations for this logical objectfifo based on the + // consumers' tiles. + if (failed(createTileLocations(targetTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the consumers' tiles."); + } + } else if (!sourceTiles.empty()) { + // Create tile locations for this logical objectfifo based on producers' + // tiles. + if (failed(createTileLocations(sourceTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the producers' tiles."); + } + } else { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Don't assign this logicalObjectFifo to a physical tile (yet!). Wait " + "for other logical objectfifos to be assigned first."); + } + + if (tileLocations.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No tile locations found for this logical objFifo. 
Maybe in a next " + "iteration, with more information, a tile location can be found."); + } + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, logicalObjectFifo.getMemref(), + tileLocations.takeVector()); + return success(); + } + + private: + // The device model used to retrieve device specific information. + const AMDAIEDeviceModel &deviceModel; +}; + +/// Assign tile locations to objectFifos. Start by searching for a set of +/// candidate tile locations and then assign tiles based on a simple usage-based +/// model that prioritizes tiles that have the least usage. +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel) { + MLIRContext *context = rewriter.getContext(); + if (failed(clearNonLocalTiles(rewriter, op))) + return op->emitOpError() << "failed to clear non-local tile assignemts"; + + // Find and fill the tile candidates. + RewritePatternSet fillTilePatterns(context); + fillTilePatterns.insert(context, deviceModel); + if (failed(applyPatternsAndFoldGreedily(op, std::move(fillTilePatterns)))) { + return op->emitOpError() + << "collection of tile candidates for logical objectFifos failed"; + } + if (failed(verify(op, true))) { + return failure(); + } + LLVM_DEBUG(llvm::dbgs() << "After fillTiles: \n" << *op << "\n"); + + // Keep track of the buffer usage on tiles to try distributing buffers evenly + // over available tile resources. + DenseMap tileLocToUsage; + auto tileLocAndUsageCmp = [&](AMDAIE::TileOp a, AMDAIE::TileOp b) -> bool { + int64_t colA = getConstantIndexOrAssert(a.getCol()); + int64_t rowA = getConstantIndexOrAssert(a.getRow()); + int64_t colB = getConstantIndexOrAssert(b.getCol()); + int64_t rowB = getConstantIndexOrAssert(b.getRow()); + size_t usageA = tileLocToUsage[TileLoc(colA, rowA)]; + size_t usageB = tileLocToUsage[TileLoc(colB, rowB)]; + if (usageA < usageB) return true; + if (usageA > usageB) return false; + if (colA < colB) return true; + if (colA > colB) return false; + if (rowA < rowB) return true; + if (rowA > rowB) return false; + assert(false && "same tiles should never be compared"); + }; + + // After filling tile candidates, find and assign a specific one. + DenseMap logicalObjFifoToTileId; + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) return WalkResult::advance(); + if (logicalObjectFifo.getTiles().size() == 0) { + logicalObjectFifo.emitOpError() + << "should have at least one tile candidate"; + return WalkResult::interrupt(); + } + + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast_if_present(tile.getDefiningOp()); + }); + AMDAIE::TileOp assignedTileOp = + *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); + + // Increase usage of the chosen tile as a new logical objectFifo will be + // assigned to it. This allows distributing the logical objectFifos + // evenly across the available tile resources. 
+        int64_t col = getConstantIndexOrAssert(assignedTileOp.getCol());
+        int64_t row = getConstantIndexOrAssert(assignedTileOp.getRow());
+        tileLocToUsage[TileLoc(col, row)] += 1;
+
+        rewriter.setInsertionPoint(logicalObjectFifo);
+        SmallVector tileResults = {
+            cast(assignedTileOp.getResult())};
+        rewriter.replaceOpWithNewOp(
+            logicalObjectFifo,
+            cast(
+                logicalObjectFifo.getOutput().getType()),
+            logicalObjectFifo.getMemref(), tileResults);
+        return WalkResult::advance();
+      });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+namespace {
+
+class AMDAIEAssignTilesPass
+    : public impl::AMDAIEAssignTilesBase {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override;
+};
+
+void AMDAIEAssignTilesPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  IRRewriter rewriter(&getContext());
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
+  std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr);
+  if (!maybeDevice) {
+    parentOp->emitOpError()
+        << "has no AMDAIEDevice in the target attribute configuration. This "
+           "device-specific information is required for looking up column and "
+           "row related information, and must be attached to a containing "
+           "ModuleOp.";
+    return signalPassFailure();
+  }
+  AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value());
+
+  // Assign tile locations to logical objectFifos on local (L1) memory.
+  if (failed(assignLocalTiles(rewriter, parentOp))) {
+    parentOp->emitOpError() << "local tile assignment failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After assignLocalTiles: \n" << *parentOp << "\n");
+
+  // Duplicate global objectFifos for each strided copy-like operation user to
+  // allow global logical objectFifos to be assigned to different tile
+  // locations.
+  if (failed(duplicateGlobalObjFifos(rewriter, parentOp))) {
+    parentOp->emitOpError() << "failed duplicating global object fifos";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After duplicateGlobalObjFifos: \n"
+                          << *parentOp << "\n");
+
+  // Assign tile locations to logical objectFifos on non-local (not L1) memory.
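+  // Candidate tiles are balanced by usage: e.g. if two L2 objectFifos both
+  // have candidates {(0, 1), (1, 1)}, the first one encountered is assigned
+  // (0, 1) and the second one (1, 1), since (0, 1) is already in use by then.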
+  if (failed(assignNonLocalTiles(rewriter, parentOp, deviceModel))) {
+    parentOp->emitOpError() << "non-local tile assignment failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After assignNonLocalTiles: \n"
+                          << *parentOp << "\n");
+}
+
+} // namespace
+
+std::unique_ptr createAMDAIEAssignTilesPass() {
+  return std::make_unique();
+}
+
+} // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
index 2f2ee3297..e42b6d597 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
@@ -8,7 +8,6 @@
 #include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/IR/Iterators.h"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
new file mode 100644
index 000000000..008bf4124
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
@@ -0,0 +1,75 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the conversion of `scf.forall` ops within
+// `amdaie.controlcode` ops into `scf.for` operations. This can help discover
+// new control code optimization opportunities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+
+#define DEBUG_TYPE "iree-amdaie-controlcode-forall-to-for"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Converts `scf.forall` operations found within the provided op into nested
+/// `scf.for` operations.
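+/// Single-iteration `scf.forall` ops are promoted (inlined) instead. For
+/// example, `scf.forall (%i, %j) in (2, 3)` is rewritten into two nested
+/// `scf.for` loops with upper bounds 2 and 3.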
+LogicalResult forallToFor(RewriterBase &rewriter, Operation *op) { + WalkResult res = op->walk([&](scf::ForallOp forallOp) { + rewriter.setInsertionPoint(forallOp); + if (succeeded(forallOp.promoteIfSingleIteration(rewriter))) { + return WalkResult::advance(); + } + if (failed(scf::forallToForLoop(rewriter, forallOp))) { + forallOp.emitOpError() << "was not transformed from `scf.forall` to `scf.for`"; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +class AMDAIEControlCodeForallToForPass + : public impl::AMDAIEControlCodeForallToForBase< + AMDAIEControlCodeForallToForPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeForallToForPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + parentOp->walk([&](AMDAIE::ControlCodeOp controlCodeOp) { + if (failed(forallToFor(rewriter, controlCodeOp.getOperation()))) { + return signalPassFailure(); + } + // Make sure to hoist `affine.apply` ops out of the innermost `scf.for` ops + // if applicable. + controlCodeOp->walk([&](affine::AffineApplyOp applyOp) { + (void)hoistForAffineApplyOp(rewriter, applyOp); + }); + }); +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeForallToForPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 4ed6d0bb0..d92f23af9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -93,10 +93,10 @@ class TransactionBuilder { uint32_t addr = direction == AMDAIE::DMAChannelDir::MM2S ? 0x1D214 : 0x1D204; if (channel == 1) addr += 0x8; - if (col && row) { - addr |= ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | - (addr & 0xFFFFF); - } + // TODO(jornt): use aie-rt's transaction serializer instead to avoid these + // indiscrepancies between this file and aie-rt. + addr = ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | + (addr & 0xFFFFF); uint32_t value = 0; value |= bdId & 0xF; value |= (repeatCount & 0xFF) << 16; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index dbd439458..26a269935 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/Support/Debug.h" @@ -323,37 +324,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Return the tiles of the sources respectively targets of the users of this -/// logical objectfifo, depending on whether the OperateOn template parameter is -/// set to `OperateOn::Source` respectively `OperateOn::Target`. 
-template -LogicalResult getUserTiles( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - SmallVectorImpl &tiles) { - llvm::SmallSetVector tileSet; - for (Operation *user : logicalObjectFifo->getUsers()) { - if (auto dmaOp = dyn_cast(user)) { - ValueRange tileIndices; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getSourceObjectFifo().getTiles(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getTargetObjectFifo().getTiles(); - } - - // Only fill in tiles when all sources have tiles. - if (tileIndices.empty()) return failure(); - for (Value index : tileIndices) { - tileSet.insert( - dyn_cast_if_present(index.getDefiningOp())); - } - } - } - tiles = tileSet.takeVector(); - return success(); -} - /// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the /// memrefs from logical objectfifos and update the computational operations to /// operate on these local memrefs. @@ -454,229 +424,6 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { return success(); } -/// Utility to recursively find users of the provided logical objectFifo inside -/// `amdaie.core` operations and return the tile coordinates. -LogicalResult findUsersInCoreAndAddTiles( - Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - llvm::SmallSetVector, 16> &tiles) { - for (Operation *userOp : op->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - AMDAIE::TileOp tileOp = coreOp.getTileOp(); - std::optional column = getConstantIntValue(tileOp.getCol()); - std::optional row = getConstantIntValue(tileOp.getRow()); - if (!column || !row) { - return coreOp.emitOpError() << "has non-constant tile location"; - } - tiles.insert(std::make_pair(column.value(), row.value())); - } - if (auto subviewOp = dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); - } else if (auto userLogicalObjectFifo = - dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(userLogicalObjectFifo, - logicalObjectFifo, tiles); - } - } - return success(); -} - -/// Assign tiles to the logical objectfifos with local memory space (L1). -/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - WalkResult res = moduleOp->walk( - [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (!memSpace || dyn_cast(memSpace).getInt() != 2) - return WalkResult::advance(); - - llvm::SmallSetVector, 16> tileLocations; - if (failed(findUsersInCoreAndAddTiles( - logicalObjectFifo, logicalObjectFifo, tileLocations))) { - return WalkResult::interrupt(); - } - // Handle subviews. 
- for (Operation *userOp : - logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { - if (auto subviewOp = dyn_cast(userOp)) { - if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, - tileLocations))) { - return WalkResult::interrupt(); - } - } - } - - SmallVector tiles; - tiles.reserve(tileLocations.size()); - rewriter.setInsertionPoint(logicalObjectFifo); - for (auto [column, row] : tileLocations) { - auto colIndex = rewriter.create( - rewriter.getUnknownLoc(), column); - auto rowIndex = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create( - rewriter.getUnknownLoc(), colIndex, rowIndex); - tiles.push_back(tileOp.getResult()); - } - // Sort for deterministic output IR. - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileValueColumnAndRowComparator); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast( - logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -/// Assign a set of potential physical AIE tiles to logical objectFifos. This -/// rewrite takes an iterative approach by matching logical objectfifos and only -/// assigning tiles when linked through dma ops with other logical objectfifos -/// which already have tiles assigned. If the linked logical objectfifos don't -/// have tiles assigned yet, we will return a failure and give the linked -/// logical objectfifos a chance to assign tiles before returning to this one. -/// -/// TODO(jornt): There are decisions being made in this pass on which tiles to -/// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the tiles in the same columns as targets and sources. At some point, -/// we probably need some AIE device model to guide the assignement here for -/// performance and to avoid hardware resource issues later on. -class FillAieTiles - : public OpRewritePattern { - using OpRewritePattern< - AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; - - LogicalResult matchAndRewrite( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - PatternRewriter &rewriter) const override { - LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); - if (!logicalObjectFifo.getTiles().empty()) { - return failure(); - } - - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - // Skip logical objectfifos within local memory as they should already be - // assigned. - if (memSpace && dyn_cast(memSpace).getInt() == 2) { - if (logicalObjectFifo.getTiles().empty()) { - logicalObjectFifo.emitOpError() - << "found logical objectfifo on local memory space with no tiles " - "assigned."; - } - return failure(); - } - // HandLe both L3/shim and L2/Memtiles. - // Skip logical objectfifos within non-global and non-shared memory. - if (memSpace && dyn_cast(memSpace).getInt() != 1) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; - } - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); - } - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int64_t rowInt = memSpace ? 
1 : 0; - llvm::SmallSetVector, 16> tileLocations; - auto createTileLocations = - [&](SmallVector &tiles) -> LogicalResult { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign - // tiles based on a resource model. - std::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - // Erase duplicates. - tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); - for (AMDAIE::TileOp tile : tiles) { - std::optional column = getConstantIntValue(tile.getCol()); - if (!column) return tile.emitOpError() << "found non-constant column"; - tileLocations.insert(std::make_pair(column.value(), rowInt)); - } - return success(); - }; - - if (!targetTiles.empty() && !sourceTiles.empty()) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with both source and target tiles, " - "which is not supported yet"; - } else if (!targetTiles.empty()) { - // Create tile locations for this logical objectfifo based on target - // tiles. - if (failed(createTileLocations(targetTiles))) { - return failure(); - } - } else if (!sourceTiles.empty()) { - // Create tile locations for this logical objectfifo based on source - // tiles. - if (failed(createTileLocations(sourceTiles))) { - return failure(); - } - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. - return failure(); - } - - // If no tile results, skip, and maybe in a next iteration another tile will - // be found. - if (tileLocations.empty()) { - return failure(); - } - - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, logicalObjectFifo.getMemref(), - tileLocations.takeVector()); - return success(); - } -}; - -/// Assign specific tile locations to objectFifos, starting from the set of -/// potential tile locations filled in earlier. -LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (memSpace && dyn_cast(memSpace).getInt() != 1) - return WalkResult::advance(); - - SmallVector tiles = - llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { - return dyn_cast_if_present(tile.getDefiningOp()); - }); - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - - // For now, use first tile in sorted list. - // TODO(jornt): This will need to become more complex in the future to - // account for potential hardware limitations and constraints. 
- SmallVector tileResults = {cast(tiles[0].getResult())}; - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return WalkResult::advance(); - }); - return success(); -} - class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< AMDAIEDistributeCoresAndObjectFifosPass> { @@ -694,6 +441,17 @@ class AMDAIEDistributeCoresAndObjectFifosPass void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { MLIRContext *context = &getContext(); ModuleOp moduleOp = getOperation(); + IRRewriter rewriter(moduleOp.getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(moduleOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + moduleOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required for tile assignment " + "purposes, and must be attached to a containing ModuleOp."; + return signalPassFailure(); + } + AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); // Convert local scf.forall operations selected for parallel distribution to // nested scf.for operations. @@ -750,7 +508,7 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { << moduleOp << "\n"); // Assign tile locations to logical objectfifos on local (L1) memory. - if (failed(assignLocalAieTiles(moduleOp))) { + if (failed(assignLocalTiles(rewriter, moduleOp))) { moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } @@ -759,40 +517,21 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n" + LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalTiles: \n" << moduleOp << "\n"); - // Assign a set of potential tile locations to the remaining logical - // objectFifos. - RewritePatternSet assignAieTilePatters(context); - assignAieTilePatters.insert(context); - if (failed(applyPatternsAndFoldGreedily(moduleOp, - std::move(assignAieTilePatters)))) { - moduleOp.emitOpError() - << "collection of tile candidates for logical objectFifos failed"; + // Assign tile locations to logical objectfifos on non-local (not L1) memory. + if (failed(assignNonLocalTiles(rewriter, moduleOp, deviceModel))) { + moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } if (failed(verify(moduleOp, true))) { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n" - << moduleOp << "\n"); - - // Assign specific tile locations to objectFifos, starting from the set of - // potential tile locations filled in earlier. 
- if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) { - moduleOp.emitOpError() - << "tile assignment and logical objectFifo distribution failed"; - return signalPassFailure(); - } - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() - << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n" - << moduleOp << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Module after assignNonLocalTiles: \n" + << moduleOp << "\n"); } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp index 710d0ddfb..0af3e8b32 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -44,7 +44,10 @@ FailureOr> getThreadIndVars(ModuleOp moduleOp) { } /// Try to detect subview(s) that look like they're 'distributing' L1 memory. -/// That is: they slice the L1 memory along thread/tile dimensions. +/// That is: they slice the L1 memory along thread/tile dimensions. If the +/// allocation `alloc` does not look like it's distributed across threads/tiles, +/// return an empty memref type. Otherwise, return the memref type that the +/// subviews are viewing. MemRefType getDistributedType(memref::AllocOp alloc, const DenseSet &indVars) { MemRefType type; @@ -54,12 +57,17 @@ MemRefType getDistributedType(memref::AllocOp alloc, // that if a subview has an offset which is not a constant and not a // thread id, it's not 'distributing'. Operation::operand_range offsets = subview.getOffsets(); + int nIndVars{0}; for (Value offset : offsets) { bool isConst = matchPattern(offset, m_Constant()); bool isIndVar = llvm::is_contained(indVars, offset); + nIndVars += isIndVar; if (!isConst && !isIndVar) return {}; } + // If there are no thread ids, this subview is not distributing. + if (nIndVars == 0) return {}; + auto nextType = cast(subview.getResult().getType()); if (!type) { type = nextType; @@ -95,7 +103,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { if (failed(maybeIndVars)) return failure(); const DenseSet &indVars = maybeIndVars.value(); IRRewriter rewriter(moduleOp.getContext()); - moduleOp->walk([&](memref::AllocOp oldAlloc) { + auto allocWalkResult = moduleOp->walk([&](memref::AllocOp oldAlloc) + -> WalkResult { // Only consider local memory (L1). 
Attribute maybeMemorySpace = oldAlloc.getType().getMemorySpace(); if (!maybeMemorySpace) return WalkResult::advance(); @@ -173,8 +182,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { return success(); }) .Default([&](Operation *user) { - user->emitOpError("needs logic implemented for handling."); - return failure(); + return user->emitOpError( + "needs logic implemented for handling."); }); if (failed(switchResult)) return WalkResult::interrupt(); @@ -183,6 +192,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { return WalkResult::advance(); }); + if (allocWalkResult.wasInterrupted()) return failure(); + return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp index 3ce342bc8..7f31b9a78 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp @@ -6,23 +6,13 @@ #include "AMDAIEDmaUtils.h" -#include - +#include "AMDAIEUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "llvm/ADT/SmallPtrSet.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" namespace mlir::iree_compiler::AMDAIE { -/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor. -Operation *getAncestorInBlock(Operation *op, Block *block) { - if (!op || !block) return nullptr; - auto parent = op; - while (parent && (parent->getBlock() != block)) - parent = parent->getParentOp(); - return parent; -} - bool areAccessPatternsCombinable(const SmallVector &offsetsA, const SmallVector &sizesA, const SmallVector &stridesA, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp index 09cbad45c..02b5ff597 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -8,31 +8,27 @@ #include -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Support/Debug.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Iterators.h" #include "mlir/IR/Operation.h" #define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils" namespace mlir::iree_compiler::AMDAIE { -/// Utility to create a new logical objectfifo based on shape defined by -/// `newSizesOpFoldResultArr`. +/// Hardcoded the transposed dimensions of L2 target dma for now. +/// The values are based on the results from ConvertToDma with option as +/// transposed on target, e.g., dma size [1, 1, 32, 32] -> [1, 32, 1, 32]. +const static SmallVector transposedL2Dims = {0, 2, 1, 3}; + +/// Utility to create a new logical objectfifo. 
static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoFromMemrefOp &oldLogicalObjectFifo, - SmallVectorImpl &newSizesOpFoldResultArr) { + AMDAIE::LogicalObjectFifoFromMemrefOp oldLogicalObjectFifo, + ArrayRef newSizes) { OpBuilder::InsertionGuard guard(rewriter); - SmallVector newSizes = llvm::map_to_vector( - newSizesOpFoldResultArr, - [](OpFoldResult sizeVal) { return getConstantIndexOrAssert(sizeVal); }); Value oldAllocOp = oldLogicalObjectFifo.getMemref(); auto oldMemRefType = cast(oldAllocOp.getType()); MemRefType newAllocType = MemRefType::get( @@ -55,22 +51,17 @@ static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( return newLogicalObjectFifo; } -/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. -SmallVector fetchDmaCpyNdOpsToSplitOrCombine( - Operation *op) { - SmallVector l2ToL1DmaOps; - // We are currently walking through CoreOps gathering 3rd Input DmaOp (if - // applicable) from them. - // TODO(avarma): We will generalize this later. - op->walk([&](AMDAIE::CoreOp coreOp) { - SmallVector inputDmas = coreOp.getInputDmas(); - if (inputDmas.size() != 3) return WalkResult::skip(); - auto dmaCpyNdOp = inputDmas[2].getDefiningOp(); - assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op"); - l2ToL1DmaOps.push_back(dmaCpyNdOp); - return WalkResult::advance(); - }); - return l2ToL1DmaOps; +/// Utility to create a new logical objectfifo based on shape defined by +/// `newSizesOpFoldResultArr`. +static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( + IRRewriter &rewriter, + AMDAIE::LogicalObjectFifoFromMemrefOp oldLogicalObjectFifo, + ArrayRef newSizesOpFoldResultArr) { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector newSizes = llvm::map_to_vector( + newSizesOpFoldResultArr, + [](OpFoldResult sizeVal) { return getConstantIndexOrAssert(sizeVal); }); + return createNewLogicalObjectFifo(rewriter, oldLogicalObjectFifo, newSizes); } /// Utility to verify that the split dimensions for L2 are contiguous. @@ -123,15 +114,14 @@ static LogicalResult checkIsRangeFromZero( /// . .|. .| /// . .|. .| /// ----- -static FailureOr updateL3SourceOffset(IRRewriter &rewriter, - OpFoldResult oldL3Offset, - int64_t offsetToAdd, - MLIRContext *context) { +static FailureOr addToOffset(IRRewriter &rewriter, + OpFoldResult oldL3Offset, + int64_t offsetToAdd) { auto createAffineMap = [&](AffineExpr affineExpr, int64_t offsetToAdd) -> AffineMap { AffineExpr newAffineExpr = affineExpr + offsetToAdd; return AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, {newAffineExpr}, - context); + rewriter.getContext()); }; OpFoldResult newL3AsSourceOffset; OpBuilder::InsertionGuard guard(rewriter); @@ -305,17 +295,41 @@ static LogicalResult checkWhetherSplitIsPossible( return success(); } -// Given a vector of L2->L1 Dma ops' perform the splitting :- -// 1. Check if the splitting can be performed or not. If not possible, bail out. -// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and -// L3. -// 3. Now traverse each L2->L1 Dma op and perform the following :- -// a) Create a new L2 AllocOp based on the updated size (step 3 above) and -// create a logicalobjectfifo using the same. -// b) Split L3->L2 Dma op. -// c) SPlit L2->L1 Dma op. -// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps. -LogicalResult splitLogicalObjectFifos( +/// Utility to determine if the strides of a dma copy operation might describe +/// a transposition of dimensions. 
Here we are only considering static strides. +/// If any of the static strides are in non-decreasing order from right to left, +/// then this might be a transpose. +static FailureOr isMaybeTransposed(Location loc, + ArrayRef strides) { + std::optional> maybeStrides = + getConstantIntValues(strides); + if (!maybeStrides) { + emitError(loc) << "expected static L2 strides"; + return failure(); + } + SmallVector staticStrides = maybeStrides.value(); + return !std::is_sorted(staticStrides.rbegin(), staticStrides.rend()); +} + +static FailureOr isDmaTransposedOnSourceSide(AMDAIE::DmaCpyNdOp dmaOp) { + return isMaybeTransposed(dmaOp->getLoc(), dmaOp.getSourceMixedStrides()); +} + +static FailureOr isDmaTransposedOnTargetSide(AMDAIE::DmaCpyNdOp dmaOp) { + return isMaybeTransposed(dmaOp->getLoc(), dmaOp.getTargetMixedStrides()); +} + +/// Given a vector of L2->L1 Dma ops' perform the splitting :- +/// 1. Check if the splitting can be performed. If it can't, bail out. +/// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and +/// L3. +/// 3. Now traverse each L2->L1 Dma op and perform the following :- +/// a) Create a new L2 AllocOp based on the updated size (step 2 above) and +/// create a logicalobjectfifo using the same. +/// b) Split L3->L2 Dma op. +/// c) Split L2->L1 Dma op. +/// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps. +LogicalResult splitLogicalObjectFifoForElementwiseOp( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context) { SplittingLogicalObjectFifoData splittingLogicalObjectFifoData; @@ -351,31 +365,14 @@ LogicalResult splitLogicalObjectFifos( SmallVector staticL3AsSourceSizes = l3ToL2DmaOp.getSourceMixedSizes(); - LogicalObjectFifoFromMemrefOp l2TargetObjectFifo = - l3ToL2DmaOp.getTargetObjectFifo(); - ArrayRef l2TargetShape = - l2TargetObjectFifo.getMemrefType().getShape(); - if (l2TargetShape.size() != staticL2AsTargetSizes.size()) { - LLVM_DEBUG(llvm::dbgs() << "L2 target size should be the same"); - return failure(); - } - - // Check if the L3->L2 dma is transposed on the target side. - bool dmaTransposeOnSource = true; - for (auto [s1, s2] : llvm::zip_equal(l2TargetShape, staticL2AsTargetSizes)) { - if (s1 != getConstantIntValue(s2)) { - dmaTransposeOnSource = false; - break; - } - } - if (staticL3AsSourceSizes.size() != staticL2AsTargetSizes.size()) { - dmaTransposeOnSource = false; - } - OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0); OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1); - if (dmaTransposeOnSource) { + FailureOr maybeTransposed = isDmaTransposedOnTargetSide(l3ToL2DmaOp); + if (failed(maybeTransposed)) return failure(); + bool dmaTransposeOnTarget = maybeTransposed.value(); + + if (!dmaTransposeOnTarget) { // Update split dimensions' offset/size for L2 as target and L3 as source. // We can afford to do this here because it's going to be the same for all // L3->L2 splits. Here we are setting offset = 0 and size = 1. @@ -389,14 +386,12 @@ LogicalResult splitLogicalObjectFifos( // The L2 target side has transposed dimensions, while the L3 source side // data are continuous and don't have `nonSplitDim`. Then the L3 source // sizes need to be modified to match the new L2 target sizes. - // Hardcoded the transposed dimensions for now. 
- const SmallVector transposeDim = {0, 2, 1, 3}; for (auto &&[splitDim, nonSplitdim] : llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) { - staticL2AsTargetOffsets[transposeDim[splitDim]] = zeroVal; - staticL2AsTargetSizes[transposeDim[splitDim]] = oneVal; + staticL2AsTargetOffsets[transposedL2Dims[splitDim]] = zeroVal; + staticL2AsTargetSizes[transposedL2Dims[splitDim]] = oneVal; staticL3AsSourceSizes[splitDim] = - staticL2AsTargetSizes[transposeDim[nonSplitdim]]; + staticL2AsTargetSizes[transposedL2Dims[nonSplitdim]]; } } @@ -414,7 +409,7 @@ LogicalResult splitLogicalObjectFifos( // If the dma transpose is on the source(target) side, then the L2 // target(source) side has the sizes in order. SmallVector newL2Sizes = - dmaTransposeOnSource ? staticL2AsTargetSizes : staticL2AsSourceSizes; + dmaTransposeOnTarget ? staticL2AsSourceSizes : staticL2AsTargetSizes; AMDAIE::LogicalObjectFifoFromMemrefOp source = createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newL2Sizes); @@ -422,7 +417,7 @@ LogicalResult splitLogicalObjectFifos( // ---------- L3 -> L2 splitting -------------- // -------------------------------------------- // Update L3 source offsets for non-split dimensions. Refer doc comment of - // `updateL3SourceOffset` for the computation rationale involved. + // `addToOffset` for the computation rationale involved. SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); for (auto &&[splitDim, nonSplitdim] : @@ -445,9 +440,9 @@ LogicalResult splitLogicalObjectFifos( // If the dma transpose is on the target side, L3 source side data are // continuous and don't have `nonSplitDim`. - size_t dim = dmaTransposeOnSource ? nonSplitdim : splitDim; - FailureOr newOffset = updateL3SourceOffset( - rewriter, staticL3AsSourceOffsets[dim], offsetToAdd, context); + size_t dim = dmaTransposeOnTarget ? splitDim : nonSplitdim; + FailureOr newOffset = + addToOffset(rewriter, staticL3AsSourceOffsets[dim], offsetToAdd); if (failed(newOffset)) { // TODO: Ideally we should be able to handle even +, -, *, /, etc. // But handle this later (if at all!) as such cases might not @@ -507,4 +502,238 @@ LogicalResult splitLogicalObjectFifos( return success(); } +/// Utility to get the `DmaCpyNdOp` producers and consumers of a given +/// objectFifo op. +LogicalResult getDmaCpyNdOpProducersAndConsumers( + AMDAIE::LogicalObjectFifoFromMemrefOp op, + SmallVector &producers, + SmallVector &consumers) { + for (Operation *userOp : op->getUsers()) { + if (auto stridedCopyOp = dyn_cast(userOp)) { + if (dyn_cast_if_present( + stridedCopyOp.getTarget().getDefiningOp()) == op) { + producers.push_back(stridedCopyOp); + } else if (dyn_cast_if_present( + stridedCopyOp.getSource().getDefiningOp()) == op) { + consumers.push_back(stridedCopyOp); + } else { + return op.emitOpError() + << "has non-consumer, non-producer doubly strided copy op user"; + } + } else { + return op.emitOpError() << "has non-doubly strided copy op user"; + } + } + return success(); +} + +using OffsetIndexAndNewOffsetT = std::tuple, int64_t>; + +/// Utility to return the index of the offsets array that refers to newly +/// splitted objectFifo and the respective offset value. Note that there might +/// not be a dimension with `stride == sizeAfterSplit`, in which case an offset +/// index can't be returned and the correct offset is `0`. 
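+/// For example, with sizes [2, 1, 32, 32], strides [2048, 1024, 32, 1], and
+/// `sizeAfterSplit` = 1024, the dimension with stride 1024 is the split index;
+/// if that dimension has size 1 and a static offset of 1, the second of the
+/// newly created objectFifos is selected.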
+FailureOr getOffsetIndexAndOffset(
+    ArrayRef offsets, ArrayRef sizes,
+    ArrayRef strides, size_t sizeAfterSplit,
+    function_ref emitError) {
+  SmallVector offsetIndices;
+  for (auto iter : llvm::enumerate(llvm::zip(strides, offsets))) {
+    std::optional maybeStride =
+        getConstantIntValue(std::get<0>(iter.value()));
+    std::optional maybeOffset =
+        getConstantIntValue(std::get<1>(iter.value()));
+    if (maybeStride.has_value() && maybeOffset.has_value() &&
+        maybeStride.value() == sizeAfterSplit && maybeOffset.value() != 0) {
+      offsetIndices.push_back(iter.index());
+    }
+  }
+  if (offsetIndices.size() > 1)
+    return emitError() << "multiple offset indices found";
+  int64_t size{1};
+  int64_t offset{0};
+  std::optional maybeOffsetIdx;
+  if (offsetIndices.size() == 1) {
+    size_t offsetIdx = offsetIndices[0];
+    maybeOffsetIdx = offsetIdx;
+    std::optional maybeSize = getConstantIntValue(sizes[offsetIdx]);
+    std::optional maybeOffset =
+        getConstantIntValue(offsets[offsetIdx]);
+    if (!maybeSize || !maybeOffset) {
+      return emitError()
+             << "expected a static target offset and size on index: "
+             << offsetIdx;
+    }
+    size = maybeSize.value();
+    offset = maybeOffset.value();
+  }
+  if (size != 1) {
+    return emitError() << "only a static size of 1 is currently "
+                          "supported on the split index";
+  }
+  return OffsetIndexAndNewOffsetT{maybeOffsetIdx, offset};
+}
+
+/// Split a logical objectFifo on the provided split dimension with the
+/// specified splitting factor.
+LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
+                                     AMDAIE::LogicalObjectFifoFromMemrefOp op,
+                                     size_t splitDim,
+                                     std::optional maybeSplitFactor) {
+  SmallVector memrefShape =
+      llvm::to_vector(op.getMemrefType().getShape());
+  int64_t splitFactor = maybeSplitFactor.has_value() ? maybeSplitFactor.value()
+                                                     : memrefShape[splitDim];
+  assert(memrefShape[splitDim] % splitFactor == 0 &&
+         "the target size for splitting is not divisible by the splitting "
+         "factor");
+  memrefShape[splitDim] /= splitFactor;
+
+  // Create `splitFactor` number of objectFifo ops.
+  SmallVector newObjFifos;
+  newObjFifos.reserve(splitFactor);
+  for (int i = 0; i < splitFactor; i++) {
+    newObjFifos.push_back(
+        createNewLogicalObjectFifo(rewriter, op, memrefShape));
+  }
+
+  // Get the producers and consumers of the current objectFifoOp.
+  SmallVector producers;
+  SmallVector consumers;
+  if (failed(getDmaCpyNdOpProducersAndConsumers(op, producers, consumers))) {
+    return failure();
+  }
+
+  // The number of elements after the split dimension. A dma dimension with
+  // this stride and a non-zero offset selects one of the newly created
+  // objectFifos (see `getOffsetIndexAndOffset`).
+  int64_t sizeAfterSplit =
+      std::accumulate(memrefShape.begin() + splitDim + 1, memrefShape.end(), 1,
+                      std::multiplies<>());
+  // Update the producer dma ops.
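+  // Each producer is redirected to the new objectFifo selected by its offset
+  // on the split index, and that offset is reset to 0 in the new dma op.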
+ for (AMDAIE::DmaCpyNdOp producer : producers) { + SmallVector targetOffsets = producer.getTargetMixedOffsets(); + SmallVector targetSizes = producer.getTargetMixedSizes(); + SmallVector targetStrides = producer.getTargetMixedStrides(); + std::optional maybeOffsetIdx; + int64_t targetOffset{0}; + FailureOr maybeOffsetIdxAndNewOffset = + getOffsetIndexAndOffset(targetOffsets, targetSizes, targetStrides, + sizeAfterSplit, + [&]() { return producer.emitOpError(); }); + if (failed(maybeOffsetIdxAndNewOffset)) { + return producer.emitOpError() + << "failed to find an offset index and new offset"; + } + std::tie(maybeOffsetIdx, targetOffset) = maybeOffsetIdxAndNewOffset.value(); + assert(targetOffset < newObjFifos.size() && + "the targetOffset should be smaller than the number of objectFifos"); + if (maybeOffsetIdx.has_value()) + targetOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(0); + AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = + newObjFifos[targetOffset]; + rewriter.setInsertionPoint(producer); + auto newDmaOp = rewriter.create( + producer.getLoc(), newObjFifo, targetOffsets, targetSizes, + targetStrides, producer.getSource(), producer.getSourceMixedOffsets(), + producer.getSourceMixedSizes(), producer.getSourceMixedStrides()); + rewriter.replaceOp(producer, newDmaOp); + } + + // Update the consumer dma ops. + for (AMDAIE::DmaCpyNdOp consumer : consumers) { + SmallVector sourceOffsets = consumer.getSourceMixedOffsets(); + SmallVector sourceSizes = consumer.getSourceMixedSizes(); + SmallVector sourceStrides = consumer.getSourceMixedStrides(); + std::optional maybeOffsetIdx; + int64_t sourceOffset{0}; + FailureOr maybeOffsetIdxAndNewOffset = + getOffsetIndexAndOffset(sourceOffsets, sourceSizes, sourceStrides, + sizeAfterSplit, + [&]() { return consumer.emitOpError(); }); + if (failed(maybeOffsetIdxAndNewOffset)) { + return consumer.emitOpError() + << "failed to find an offset index and offset"; + } + std::tie(maybeOffsetIdx, sourceOffset) = maybeOffsetIdxAndNewOffset.value(); + assert(sourceOffset < newObjFifos.size() && + "the sourceOffset should be smaller than the number of objectFifos"); + if (maybeOffsetIdx.has_value()) + sourceOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(0); + AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = + newObjFifos[sourceOffset]; + rewriter.setInsertionPoint(consumer); + auto newDmaOp = rewriter.create( + consumer.getLoc(), consumer.getTarget(), + consumer.getTargetMixedOffsets(), consumer.getTargetMixedSizes(), + consumer.getTargetMixedStrides(), newObjFifo, sourceOffsets, + sourceSizes, sourceStrides); + rewriter.replaceOp(consumer, newDmaOp); + } + return success(); +} + +/// Split doubly strided operations on a source and target split dimension with +/// the provided split factor. 
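+/// For example, a split factor of 2 on a dimension with (static) size 64
+/// produces two strided copies of size 32 each, with the second copy's offset
+/// on that dimension advanced by 32.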
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim, size_t targetSplitDim, + std::optional maybeSplitFactor) { + if (!op->use_empty()) + return op.emitOpError() << "can't be split because it has uses"; + SmallVector sourceOffsets = op.getSourceMixedOffsets(); + SmallVector sourceSizes = op.getSourceMixedSizes(); + SmallVector sourceStrides = op.getSourceMixedStrides(); + SmallVector targetOffsets = op.getTargetMixedOffsets(); + SmallVector targetSizes = op.getTargetMixedSizes(); + SmallVector targetStrides = op.getTargetMixedStrides(); + assert(sourceSplitDim < sourceOffsets.size() && + "the dimension to be split on should be smaller than the number of " + "source dimensions"); + assert(targetSplitDim < targetOffsets.size() && + "the dimension to be split on should be smaller than the number of " + "target dimensions"); + std::optional sourceSize = + getConstantIntValue(sourceSizes[sourceSplitDim]); + std::optional targetSize = + getConstantIntValue(targetSizes[targetSplitDim]); + if (!sourceSize) { + return op.emitOpError() + << "does not have a static source size on dim: " << sourceSplitDim; + } + if (!targetSize) { + return op.emitOpError() + << "does not have a static target size on dim: " << targetSplitDim; + } + int64_t splitFactor = maybeSplitFactor.has_value() + ? maybeSplitFactor.value() + : std::gcd(sourceSize.value(), targetSize.value()); + if (sourceSize.value() % splitFactor != 0 || + targetSize.value() % splitFactor != 0) { + return op.emitOpError() << "the target or source size is not divisible by " + "the provided splitting factor: " + << splitFactor; + } + int64_t newSourceSize = sourceSize.value() / splitFactor; + int64_t newTargetSize = targetSize.value() / splitFactor; + sourceSizes[sourceSplitDim] = rewriter.getIndexAttr(newSourceSize); + targetSizes[targetSplitDim] = rewriter.getIndexAttr(newTargetSize); + rewriter.setInsertionPoint(op); + for (int i = 0; i < splitFactor; ++i) { + FailureOr newSourceOffset = addToOffset( + rewriter, sourceOffsets[sourceSplitDim], newSourceSize); // i * + FailureOr newTargetOffset = addToOffset( + rewriter, targetOffsets[targetSplitDim], newTargetSize); // i * + if (failed(newSourceOffset)) + return op.emitOpError() << "could not create a new source offset"; + if (failed(newTargetOffset)) + return op.emitOpError() << "could not create a new target offset"; + op.createDoublyStridedOp(rewriter, targetOffsets, targetSizes, + targetStrides, sourceOffsets, sourceSizes, + sourceStrides); + sourceOffsets[sourceSplitDim] = newSourceOffset.value(); + targetOffsets[targetSplitDim] = newTargetOffset.value(); + } + rewriter.eraseOp(op); + return success(); +} + } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h index f9339b2ac..c470d917b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h @@ -11,14 +11,26 @@ namespace mlir::iree_compiler::AMDAIE { -/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. -SmallVector fetchDmaCpyNdOpsToSplitOrCombine(Operation *op); - /// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops. 
-LogicalResult splitLogicalObjectFifos( +LogicalResult splitLogicalObjectFifoForElementwiseOp( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context); +/// Split a logical objectFifo on the provided split dimension with the +/// specified splitting factor. If no split factor is provided, the logical +/// objectFifo will be split on the size of the dimension being split. +LogicalResult splitLogicalObjectFifo( + IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromMemrefOp op, + size_t splitDim = 0, std::optional splitFactor = std::nullopt); + +/// Split doubly strided operations on a source and target split dimension with +/// the provided split factor. If no split factor is provided, the doubly +/// strided operation will be split on the size of the dimension being split. +LogicalResult splitDoublyStridedOp( + IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim = 0, size_t targetSplitDim = 0, + std::optional splitFactor = std::nullopt); + } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index db00f723b..a0f40c369 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -405,7 +405,7 @@ LogicalResult AIEDeviceBuilder::bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock, int &bufferId) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::BufferOp]\n"); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); auto elemType = cast(bufferOp.getType()); Value tile = mapper.lookup(bufferOp.getTile()); auto aieBufferOp = rewriter.create( @@ -431,7 +431,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( AMDAIE::ConnectionOp connectionOp, Block *deviceBlock, int &connectionIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n"); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector producerChannels; SmallVector consumerChannels; for (Value producerChannel : connectionOp.getSourceChannels()) { @@ -543,7 +543,8 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } std::pair lockPair = std::make_pair(consumerLocks[0], producerLocks[0]); - rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(memOp, deviceBlock, + deviceBlock->without_terminator().end()); createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, lockPair, packetId); @@ -631,7 +632,8 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } std::pair lockPair = std::make_pair(producerLocks[0], consumerLocks[0]); - rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(memOp, deviceBlock, + deviceBlock->without_terminator().end()); createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, lockPair, packetId); @@ -649,7 +651,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, Block *deviceBlock) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n"); - rewriter.setInsertionPointToEnd(deviceBlock); + 
rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector producerChannels; SmallVector consumerChannels; for (Value producerChannel : flowOp.getSources()) { @@ -671,7 +673,7 @@ LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, consumerChannels.push_back(channelOp); } // Insert flow ops. - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector flowOps = createFlowOps(flowOp, producerChannels, consumerChannels); return success(); @@ -681,7 +683,7 @@ LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n"); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); Value tile = mapper.lookup(lockOp.getTile()); auto aieLockOp = rewriter.create( lockOp.getLoc(), tile, lockOp.getValueAttr(), lockOp.getInitValueAttr(), @@ -712,7 +714,7 @@ LogicalResult logicalObjFifoFromBuffersToMemOp( for (Value tile : logicalObjFifo.getTiles()) { if (tileToMemOpMap.contains(tile)) continue; Value aieTile = mapper.lookup(tile); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); auto newMemOp = rewriter.create(rewriter.getUnknownLoc(), aieTile); rewriter.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock()); rewriter.create(rewriter.getUnknownLoc()); @@ -855,7 +857,7 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, return WalkResult::advance(); }) .Default([&](Operation *op) { - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); if (!isa_and_present(op->getDialect())) { rewriter.clone(*op, mapper); } else { @@ -868,7 +870,8 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, if (res.wasInterrupted()) return failure(); // Merge core operations into end of the device block - rewriter.mergeBlocks(deviceCoreBlock, deviceBlock); + rewriter.inlineBlockBefore(deviceCoreBlock, deviceBlock, + deviceBlock->without_terminator().end()); return success(); } @@ -902,7 +905,9 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { auto deviceOp = rewriter.create( rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice)); - Block *deviceBlock = &deviceOp.getRegion().emplaceBlock(); + xilinx::AIE::DeviceOp::ensureTerminator(deviceOp.getRegion(), rewriter, + deviceOp.getLoc()); + Block *deviceBlock = deviceOp.getBody(); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); // Create aiex.runtime_sequence inside aie.device @@ -954,7 +959,8 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { } // Move NPU instruction function to the end of the device block. - rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(npuFuncOp, deviceBlock, + deviceBlock->without_terminator().end()); // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. 
eraseOp(funcOp);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
index 0a6118ab3..6cd8d86b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
@@ -4,10 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
-#include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "iree-amdaie-peel-for-loop"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
new file mode 100644
index 000000000..83e29bffe
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -0,0 +1,222 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Utility struct to represent DMA split information.
+struct DmaSplitInfo {
+  size_t sourceSplitDim{0};
+  size_t targetSplitDim{0};
+};
+
+using DmaObjFifoPairT =
+    std::pair<AMDAIE::DmaCpyNdOp, AMDAIE::LogicalObjectFifoFromMemrefOp>;
+
+/// Find the logical objectFifo and DMA source/target splitting dimensions for
+/// each DMA and objectFifo pair.
+///
+/// Each pair is handled in the following way:
+/// First, compute the objectFifo splitting dimension as the first (outermost)
+/// non-unit shape dimension. Afterwards, depending on which logical objectFifo
+/// is being split on, find the outermost dimension in either the source or
+/// target access pattern that has:
+/// - stride == sizeAfterSplit
+/// - size != 1
+/// This is the splitting dimension to be used on the respective side of the
+/// DMA operation. Then, calculate the product size of that side of the DMA
+/// operation after the splitting dimension and use it to calculate the
+/// splitting dimension on the other side as the first dimension from the back
+/// that has a product size larger than the other side's product size after
+/// splitting, because that's the number of elements that should be
+/// produced/consumed on the respective sides before splitting.
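+///
+/// Purely illustrative walkthrough (shapes invented for this comment, not
+/// taken from the lit tests): for an L2 objectFifo with memref shape
+/// [2, 1, 32, 32], the objectFifo split dimension is dim 0 and
+/// sizeAfterSplit = 1 * 32 * 32 = 1024. For an L3 -> L2 DMA writing to this
+/// objectFifo with target sizes [2, 32, 32] and target strides [1024, 32, 1],
+/// the target split dimension is dim 0 (stride == 1024, size != 1) and the
+/// product of the target sizes after it is 32 * 32 = 1024. With source sizes
+/// [2, 32, 32], no inner dimension has a suffix product larger than 1024, so
+/// the source split dimension stays at the default dim 0.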
+LogicalResult collectSplittingDims(
+    const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
+    DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
+    DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t>
+        &objFifoSplitDimMap) {
+  for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
+    LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
+    ArrayRef<int64_t> memrefShape = objFifo.getMemrefType().getShape();
+    if (llvm::any_of(memrefShape, [](int64_t size) {
+          return ShapedType::isDynamic(size);
+        })) {
+      return objFifo.emitOpError()
+             << "can't find a valid split dimension for dynamic sizes memref";
+    }
+    auto iter = std::find_if(memrefShape.begin(), memrefShape.end(),
+                             [](int64_t size) { return size > 1; });
+    size_t objFifoSplitDim = std::distance(memrefShape.begin(), iter);
+    // If all dimensions are unit (1), no splitting can be done, so continue to
+    // the next pair.
+    if (objFifoSplitDim >= memrefShape.size()) continue;
+    int64_t sizeAfterSplit =
+        std::accumulate(memrefShape.begin() + objFifoSplitDim + 1,
+                        memrefShape.end(), 1, std::multiplies<>());
+
+    size_t sourceSplitDim{0};
+    size_t targetSplitDim{0};
+    if (dmaOp.getTargetObjectFifo() == objFifo) {
+      std::optional<SmallVector<int64_t>> targetSizes =
+          getConstantIntValues(dmaOp.getTargetMixedSizes());
+      std::optional<SmallVector<int64_t>> targetStrides =
+          getConstantIntValues(dmaOp.getTargetMixedStrides());
+      std::optional<SmallVector<int64_t>> sourceSizes =
+          getConstantIntValues(dmaOp.getSourceMixedSizes());
+      if (!targetSizes.has_value() || !targetStrides.has_value() ||
+          !sourceSizes.has_value()) {
+        return dmaOp.emitOpError() << "has unsupported dynamic target strides "
+                                      "or sizes or source sizes";
+      }
+      for (auto iter : llvm::enumerate(
+               llvm::zip(targetSizes.value(), targetStrides.value()))) {
+        int64_t size = std::get<0>(iter.value());
+        int64_t stride = std::get<1>(iter.value());
+        if (stride == sizeAfterSplit && size != 1) {
+          targetSplitDim = iter.index();
+          break;
+        }
+      }
+      int64_t targetSizeAfterSplit =
+          std::accumulate(targetSizes.value().begin() + targetSplitDim + 1,
+                          targetSizes.value().end(), 1, std::multiplies<>());
+      SmallVector<int64_t> sourceProductSizes = sourceSizes.value();
+      std::partial_sum(sourceProductSizes.rbegin(), sourceProductSizes.rend(),
+                       sourceProductSizes.rbegin(),
+                       std::multiplies<int64_t>());
+      for (int idx = sourceProductSizes.size() - 1; idx > 0; idx--) {
+        if (sourceProductSizes[idx] > targetSizeAfterSplit) {
+          sourceSplitDim = idx;
+          break;
+        }
+      }
+    } else if (dmaOp.getSourceObjectFifo() == objFifo) {
+      // Find outermost dimension in the access pattern that has stride ==
+      // sizeAfterSplit and size != 1.
+      std::optional<SmallVector<int64_t>> sourceSizes =
+          getConstantIntValues(dmaOp.getSourceMixedSizes());
+      std::optional<SmallVector<int64_t>> sourceStrides =
+          getConstantIntValues(dmaOp.getSourceMixedStrides());
+      std::optional<SmallVector<int64_t>> targetSizes =
+          getConstantIntValues(dmaOp.getTargetMixedSizes());
+      if (!sourceSizes.has_value() || !sourceStrides.has_value() ||
+          !targetSizes.has_value()) {
+        return dmaOp.emitOpError() << "has unsupported dynamic source strides "
+                                      "or sizes or target sizes";
+      }
+      for (auto iter : llvm::enumerate(
+               llvm::zip(sourceSizes.value(), sourceStrides.value()))) {
+        int64_t size = std::get<0>(iter.value());
+        int64_t stride = std::get<1>(iter.value());
+        if (stride == sizeAfterSplit && size != 1) {
+          sourceSplitDim = iter.index();
+          break;
+        }
+      }
+      int64_t sourceRemainderSize =
+          std::accumulate(sourceSizes.value().begin() + sourceSplitDim + 1,
+                          sourceSizes.value().end(), 1, std::multiplies<>());
+      SmallVector<int64_t> targetProductSizes = targetSizes.value();
+      std::partial_sum(targetProductSizes.rbegin(), targetProductSizes.rend(),
+                       targetProductSizes.rbegin(),
+                       std::multiplies<int64_t>());
+      for (int idx = targetProductSizes.size() - 1; idx > 0; idx--) {
+        if (targetProductSizes[idx] > sourceRemainderSize) {
+          targetSplitDim = idx;
+          break;
+        }
+      }
+    }
+    LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n");
+    DmaSplitInfo dmaSplitInfo = {sourceSplitDim, targetSplitDim};
+    dmaSplitInfoMap[dmaOp] = std::move(dmaSplitInfo);
+    objFifoSplitDimMap[objFifo] = objFifoSplitDim;
+  }
+  return success();
+}
+
+class AMDAIESplitLogicalObjFifosPass
+    : public impl::AMDAIESplitLogicalObjFifosBase<
+          AMDAIESplitLogicalObjFifosPass> {
+ public:
+  using AMDAIESplitLogicalObjFifosBase::AMDAIESplitLogicalObjFifosBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  // Walk and collect all dma ops between L3 and L2.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3L2DmaOps;
+  SmallVector<DmaObjFifoPairT> dmaObjFifoPairs;
+  WalkResult res = moduleOp->walk([&](AMDAIE::DmaCpyNdOp op) {
+    std::optional<uint8_t> sourceMemSpace = op.getSourceMemorySpaceAsUInt();
+    std::optional<uint8_t> targetMemSpace = op.getTargetMemorySpaceAsUInt();
+    if (!sourceMemSpace || !targetMemSpace) {
+      op.emitOpError() << "expected a source and target memory space";
+      return WalkResult::interrupt();
+    }
+    if (sourceMemSpace.value() == 1 && targetMemSpace.value() == 0) {
+      dmaObjFifoPairs.push_back({op, op.getSourceObjectFifo()});
+    } else if (sourceMemSpace.value() == 0 && targetMemSpace.value() == 1) {
+      dmaObjFifoPairs.push_back({op, op.getTargetObjectFifo()});
+    }
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return signalPassFailure();
+
+  // Collect the split dimensions for all DMA and objectFifo pairs.
+  DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> dmaSplitInfoMap;
+  DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t> objFifoSplitDimMap;
+  if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap,
+                                  objFifoSplitDimMap))) {
+    return signalPassFailure();
+  }
+
+  /// Split the DMA and objectFifo ops based on the calculated splitting
+  /// dimensions.
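+  /// For example (illustrative only): with a split factor of 2, a DMA whose
+  /// source and target split dimensions both have size 2 is rewritten into
+  /// two DMA ops, and a [2, 1, 32, 32] L2 objectFifo split on dim 0 becomes
+  /// two [1, 1, 32, 32] objectFifos, so that they can be distributed over
+  /// multiple memTiles/shimTiles.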
+  for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
+    auto stridedOp =
+        cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
+    if (failed(splitDoublyStridedOp(rewriter, stridedOp,
+                                    dmaSplitInfo.sourceSplitDim,
+                                    dmaSplitInfo.targetSplitDim))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Failed to perform splitting of the DMA op: " << dmaOp);
+      return signalPassFailure();
+    }
+  }
+  for (auto &&[objFifo, splitDim] : objFifoSplitDimMap) {
+    if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitDim))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Failed to perform splitting of objectFifo op");
+      return signalPassFailure();
+    }
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass() {
+  return std::make_unique<AMDAIESplitLogicalObjFifosPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
index 4839246a4..f4ac4f9fd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -6,8 +6,6 @@
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
-// #include "llvm/Support/Debug.h"
-#include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos-for-connection-reuse"
@@ -34,10 +32,24 @@ void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
   MLIRContext *context = &getContext();
   IRRewriter rewriter(context);
 
-  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
-      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+  // Walk through CoreOps gathering 3rd input DmaOps (if applicable) which will
+  // be used to split L2 objectFifos of elementwise input for connection reuse.
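+  // (Illustrative assumption, not enforced here: for a fused matmul +
+  // elementwise core the three input DMAs are expected to be the matmul LHS,
+  // the matmul RHS and the extra elementwise operand, so the third input DMA
+  // below is the one whose L2 objectFifo gets split for connection reuse.)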
+ SmallVector l2ToL1DmaOps; + WalkResult res = moduleOp->walk([&](AMDAIE::CoreOp coreOp) { + SmallVector inputDmas = coreOp.getInputDmas(); + if (inputDmas.size() != 3) return WalkResult::skip(); + auto dmaCpyNdOp = inputDmas[2].getDefiningOp(); + if (!dmaCpyNdOp) { + coreOp->emitOpError() << "failed to get a DmaCpyNdOp from the input"; + return WalkResult::interrupt(); + } + l2ToL1DmaOps.push_back(dmaCpyNdOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); - if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) { + if (failed(splitLogicalObjectFifoForElementwiseOp(rewriter, l2ToL1DmaOps, + context))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of logicalobjectfifos"); return signalPassFailure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp index ae6081349..2cd57beb2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp @@ -6,10 +6,10 @@ #include "AMDAIEUtils.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringExtras.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Iterators.h" namespace mlir::iree_compiler::AMDAIE { @@ -251,7 +251,7 @@ bool isMatmulInDefChain(Value operand) { /// Utility to identify if `linalgOp` is an elementwise operation with a /// matmul-like op upstream in its computation tree. bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp) { - if (!isElementwise(linalgOp) || isa(linalgOp)) { + if (!linalg::isElementwise(linalgOp) || isa(linalgOp)) { return false; } // Check if any of the defining op is a matmul-like op. @@ -272,6 +272,13 @@ std::string utohexstr(uint32_t value, size_t width, bool header, return res + prefix + hexStr; } +/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor. +Operation *getAncestorInBlock(Operation *op, Block *block) { + if (!op || !block) return nullptr; + while (op && (op->getBlock() != block)) op = op->getParentOp(); + return op; +} + /// Find the largest factor of 'num' which is not larger than 'max'. int detail::findLargestFactor(int num, int max) { assert(max > 0 && "No factors less than or equal to 0 exist"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h index e78ebc281..3daac3cd7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h @@ -7,11 +7,9 @@ #ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIEUTILS_H_ #define IREE_AMD_AIE_TRANSFORMS_AMDAIEUTILS_H_ -#include - -#include "iree-amd-aie/IR/AMDAIEAttrs.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "iree-amd-aie/aie_runtime/AMDAIEEnums.h" +#include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/IR/Types.h" namespace mlir::iree_compiler::AMDAIE { @@ -62,15 +60,22 @@ bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp); std::string utohexstr(uint32_t value, size_t width, bool header = true, bool lowercase = false); +/// If `op` is in `block`, then return `op`. 
Otherwise traverse through parents +/// to the first ancestor of `op` that is in `block`, and return that +/// ancestor. If `op` has no ancestor in `block`, or if `op` is nullptr or +/// `block` is nullptr, return nullptr. +Operation *getAncestorInBlock(Operation *op, Block *block); + namespace detail { -// Returns the largest number that perfectly divides `num` that -// is less than or equal to max +/// Returns the largest number that perfectly divides `num` that +/// is less than or equal to max int findLargestFactor(int num, int max); -// A variant where we prefer factors to also be a multiple of `multiple` +/// A variant where we prefer factors to also be a multiple of `multiple` int findLargestFactor(int num, int max, int multiple); + } // namespace detail } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index c466003aa..e56c949d3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -52,12 +52,14 @@ iree_cc_library( "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" "AMDAIEAssignPacketIds.cpp" + "AMDAIEAssignTiles.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" "AMDAIEConnectionToFlow.cpp" "AMDAIEConvertToDma.cpp" + "AMDAIEControlCodeForallToFor.cpp" "AMDAIEControlCodeLowering.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEControlCodeToTransaction.cpp" @@ -101,6 +103,7 @@ iree_cc_library( "AMDAIEPropagateDataLayout.cpp" "AMDAIERemoveMemorySpace.cpp" "AMDAIESinkIntoCore.cpp" + "AMDAIESplitLogicalObjFifos.cpp" "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp" "AMDAIETemporaryAllocBufferization.cpp" "AMDAIETile.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp index 9f937b8b4..4d92fb59c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp @@ -241,10 +241,11 @@ FailureOr lowerUnPack( auto tileShape = srcShape.drop_front(destRank); // Append the inner tile shape to the permuted and rank-reduced outer shape. 
readShape.append(tileShape.begin(), tileShape.end()); - Type elemType = unPackOp.getInputType().getElementType(); - Attribute memorySpace = - cast(unPackOp.getInputType()).getMemorySpace(); - auto readType = MemRefType::get(readShape, elemType, nullptr, memorySpace); + MemRefType inputType = cast(unPackOp.getInputType()); + auto readType = + cast(memref::SubViewOp::inferRankReducedResultType( + readShape, inputType, readOffsets, readSizes, readStrides)); + tile = rewriter.create(loc, readType, input, readOffsets, readSizes, readStrides); perm = getPackUnpackNormalizedPerm(readType.getRank(), innerDimsPos); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 47cbc5dff..232a7b874 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -27,6 +27,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS #define GEN_PASS_DEF_AMDAIEASSIGNPACKETIDS +#define GEN_PASS_DEF_AMDAIEASSIGNTILES #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP @@ -34,6 +35,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW +#define GEN_PASS_DEF_AMDAIECONTROLCODEFORALLTOFOR #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL #define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING #define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION @@ -80,6 +82,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT #define GEN_PASS_DEF_AMDAIEREMOVEMEMORYSPACE #define GEN_PASS_DEF_AMDAIESINKINTOCORE +#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOS #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE #define GEN_PASS_DEF_AMDAIETEMPORARYALLOCBUFFERIZATION #define GEN_PASS_DEF_AMDAIETILE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 21929a568..98297208e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -602,6 +602,13 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass()); + // Currently, SplitLogicalObjFifos pass only works for matmul-like ops. + if (useTilePipeline == TilePassPipeline::PackPeelPipeline) + passManager.addPass(createAMDAIESplitLogicalObjFifosPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + + passManager.addPass(createAMDAIEAssignTilesPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); @@ -622,6 +629,12 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + // Convert control code `scf.forall` ops to `scf.for` ops right before the DMA + // composition optimization pass to enable more loop subsumption optimization + // opportunities. 
+ passManager.addPass(createAMDAIEControlCodeForallToForPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCompositionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index aab36fd7c..3020dc969 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -93,6 +93,9 @@ std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); /// Create a pass to assign packet ids to `amdaie.flow` operations. std::unique_ptr createAMDAIEAssignPacketIdsPass(); +/// Create a pass to assign physical tile locations to logical objFifos. +std::unique_ptr createAMDAIEAssignTilesPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. std::unique_ptr createAMDAIEBridgeToAIRPass(); @@ -112,6 +115,9 @@ std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( /// Create pass to create `amdaie.flow` ops for connections. std::unique_ptr createAMDAIEConnectionToFlowPass(); +/// Pass to convert `scf.forall` to `scf.for` within `amdaie.controlcode`. +std::unique_ptr createAMDAIEControlCodeForallToForPass(); + /// Pass to unroll the loops within the control code regions. std::unique_ptr createAMDAIEControlCodeLoopUnrollPass(); @@ -276,6 +282,9 @@ std::unique_ptr createAMDAIERemoveMemorySpacePass(); /// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); +/// Create a pass to split logicalobjectfifos for shimTile/memTile distribution. +std::unique_ptr createAMDAIESplitLogicalObjFifosPass(); + /// Create a pass to split logicalobjectfifos for connection reuse. std::unique_ptr createAMDAIESplitLogicalObjFifosForConnectionReusePass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 035cec0a1..9a4b2ce62 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -67,6 +67,12 @@ def AMDAIEAssignPacketIds : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignPacketIdsPass()"; } +def AMDAIEAssignTiles : Pass<"iree-amdaie-assign-tiles", ""> { + let summary = "Assign physical tile locations to logical objectFifos. 
" + "Existing assignments will be ignored/replaced."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignTilesPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; @@ -147,6 +153,12 @@ def AMDAIEConnectionToFlow : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConnectionToFlowPass()"; } +def AMDAIEControlCodeForallToFor : + Pass<"iree-amdaie-controlcode-forall-to-for", ""> { + let summary = "Converts `scf.forall` to `scf.for` within `amdaie.controlcode`."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeForallToForPass()"; +} + def AMDAIEControlCodeLoopUnroll : Pass<"iree-amdaie-controlcode-loop-unroll", ""> { let summary = "Unroll the loops in the control code regions."; @@ -667,6 +679,25 @@ def AMDAIESinkIntoCore : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()"; } +def AMDAIESplitLogicalObjFifos : + Pass<"iree-amdaie-split-logical-objectfifos", "ModuleOp"> { + let summary = "Pass to split L2 buffers to distribute on multiple shimTiles and memTiles."; + let description = [{ + Splitting L2 input and output logical objectFifos and their user dma operations, + so that the logical objectFifos can be distributed on multiple shimTiles/memTiles. + Ideally, the split factor should only depend on the number of AIE columns being used, + however the current implementation only considers situations in which `nrows == ncols`. + + For example, for the case of a matmul C = A x B, the two outermost dimensions + of the L2 buffers are the implications of `nrows x ncols` AIE cores being used. + So if, A matrix is distributed on a 2x2 AIE array, with L2 buffer size + `[2, 1, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers. + Similarly, B matrix is distributed on a 2x2 AIE array with L2 buffer size + `[1, 2, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers. + }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosPass()"; +} + def AMDAIESplitLogicalObjFifosForConnectionReuse : Pass<"iree-amdaie-split-logical-objectfifos-for-connection-reuse", "ModuleOp"> { let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 10d444584..ca56f4446 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -14,6 +14,15 @@ namespace mlir::iree_compiler::AMDAIE { +/// Assign tile locations to the logical objectfifos with local memory space +/// (L1). +LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op); + +/// Assign tile locations to the logical objectfifos with non-local memory space +/// (L2, L3 etc, not L1). +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel); + /// Unroll the loops within the control code regions. 
LogicalResult controlCodeLoopUnroll(RewriterBase &rewriter, AMDAIE::ControlCodeOp controlCodeOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 2e5a400cf..151812b33 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -16,6 +16,7 @@ iree_lit_test_suite( "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" "assign_packet_ids.mlir" + "assign_tiles.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_doubly_strided_op.mlir" @@ -23,6 +24,7 @@ iree_lit_test_suite( "canonicalize_npu_dma_cpy_nd.mlir" "combine_strided_ops.mlir" "connection_to_flow.mlir" + "controlcode_forall_to_for.mlir" "controlcode_loop_unrolling.mlir" "controlcode_lowering.mlir" "controlcode_to_transaction.mlir" @@ -73,6 +75,7 @@ iree_lit_test_suite( "propagate_data_layout.mlir" "remove_memory_space.mlir" "sink_into_core.mlir" + "split_logicalobjfifos.mlir" "split_logicalobjfifos_for_connection_reuse.mlir" "temporary_alloc_bufferization.mlir" "tile_and_fuse_using_scf_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir index 6ef0acc6b..d8d45da0f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir @@ -119,13 +119,13 @@ func.func @read_and_write(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + scf.for %arg = %c0 to %c32 step %c1 { + %access1= amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : memref<32xi32, 2>) + } + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @loop_without_epilogue +// CHECK: amdaie.core +// CHECK-NEXT: acquire +// CHECK-NEXT: access +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-NEXT: scf.for +// CHECK-SAME: { +// CHECK-NEXT: acquire +// CHECK-NEXT: access +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-SAME: {size = 1 : i32} +// CHECK-NEXT: } +func.func @loop_without_epilogue(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + %access1 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : 
memref<32xi32, 2>) + scf.for %arg = %c0 to %c32 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + } + amdaie.end + } + return +} + +// ----- + +// Test of the case +// +// for ... { +// access +// for ... { +// access +// } +// } +// +// with expected result +// +// for ... { +// acquire +// release +// for ... { +// acquire +// release +// } +// } + +// CHECK-LABEL: @nested_for_loops +// CHECK: amdaie.core +// CHECK: scf.for +// CHECK: acquire +// CHECK: access +// CHECK: linalg.fill +// CHECK: logicalobjectfifo.release +// CHECK: scf.for +// CHECK: acquire +// CHECK: access +// CHECK: linalg.fill +// CHECK: logicalobjectfifo.release +// CHECK: } +// CHECK: } +// CHECK: amdaie.end +func.func @nested_for_loops(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %tile = amdaie.tile(%c0, %c0) + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + scf.for %arg_0 = %c0 to %c4 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0 : index) outs(%access2 : memref<32xi32, 2>) + scf.for %arg_1 = %c0 to %c8 step %c1 { + %access3 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0 : index) outs(%access3 : memref<32xi32, 2>) + } + } + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @epilogue_write_with_preceding_none_accesses +// CHECK: amdaie.core +// CHECK-NEXT: acquire +// CHECK-SAME: Produce +// CHECK-NEXT: access +// CHECK-SAME: Write +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: scf.for +// CHECK-SAME: { +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: } +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-SAME: Produce +// CHECK-NEXT: amdaie.end + +// With the current implementation, the acquire for Write access is inserted +// before the very first access of the objectfifo, which in this case is on a +// None access. 
+func.func @epilogue_write_with_preceding_none_accesses( + %arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + %access1 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : memref<32xi32, 2>) + scf.for %arg = %c0 to %c32 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + } + %access3 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access3 : memref<32xi32, 2>) + amdaie.end + } + return +} + diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir new file mode 100644 index 000000000..10bd42978 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir @@ -0,0 +1,360 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-assign-tiles,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Test assignment of L1 objFifos based on the cores where they are used. 
+// CHECK-LABEL: @assign_local_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_local_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<1024xi32, 2> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.core(%tile_0_2, in : [], out : []) { + %3 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32, 2> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<2048xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos onto different columns. 
+// CHECK-LABEL: @assign_l2_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<1024xi32, 2> + %alloc_3 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.dma_cpy_nd(%3[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.core(%tile_0_2, in : [], out : []) { + %7 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %8 = amdaie.core(%tile_1_2, in : [], out : []) { + %9 = amdaie.logicalobjectfifo.access(%3, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<1024xi32, 2> + memref.dealloc %alloc_3 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 and L2 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l3_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l3_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos onto different columns. 
+// CHECK-LABEL: @assign_l3_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_0]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32> + %alloc_1 = memref.alloc() : memref<2048xi32> + %alloc_2 = memref.alloc() : memref<1024xi32, 1> + %alloc_3 = memref.alloc() : memref<2048xi32, 1> + %alloc_4 = memref.alloc() : memref<1024xi32, 2> + %alloc_5 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%4[] [] [], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%5[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.core(%tile_0_2, in : [], out : []) { + %11 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %12 = amdaie.core(%tile_1_2, in : [], out : []) { + %13 = amdaie.logicalobjectfifo.access(%5, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32> + memref.dealloc %alloc_1 : memref<2048xi32> + memref.dealloc %alloc_2 : memref<1024xi32, 1> + memref.dealloc %alloc_3 : memref<2048xi32, 1> + memref.dealloc %alloc_4 : memref<1024xi32, 2> + memref.dealloc %alloc_5 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test duplicate global logical objectFifos (L3). 
+// CHECK-LABEL: @duplicate_global_object_fifos +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @duplicate_global_object_fifos() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%2[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %7 = amdaie.core(%tile_1_2, in : [], out : []) { + %8 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir new file mode 100644 index 000000000..02f5e5120 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir @@ -0,0 +1,132 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-forall-to-for,canonicalize)" --split-input-file %s | FileCheck %s + +// CHECK-LABEL: @test_promotion +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: func.call @callee(%[[C0]], %[[C0]]) : (index, index) -> () +module @test_promotion { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + 
scf.forall (%i, %j) in (1, 1) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_single +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +module @test_single { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_multi +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +module @test_multi { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (3, 2) { + func.call @callee(%i, %j) : (index, index) -> () + } + scf.forall (%i, %j) in (2, 3) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_nested +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) : (index, index, index, index) -> () +module @test_nested { + func.func private @callee(%i: index, %j: index, %k: index, %l: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + scf.forall (%k, %l) in (2, 3) { + func.call @callee(%i, %j, %k, %l) : (index, index, index, index) -> () + } + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_affine_apply +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: %[[APPLY0:.+]] = affine.apply #map(%[[ARG0]]) +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: %[[APPLY1:.+]] = affine.apply #map(%[[ARG1]]) +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: %[[APPLY2:.+]] = affine.apply #map(%[[ARG2]]) +// CHECK: scf.for %[[ARG3:.+]] 
= %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: %[[APPLY3:.+]] = affine.apply #map(%[[ARG3]]) +// CHECK: func.call @callee(%[[APPLY0]], %[[APPLY1]], %[[APPLY2]], %[[APPLY3]]) : (index, index, index, index) -> () +#map = affine_map<(d0) -> (d0 * 32)> +module @test_affine_apply { + func.func private @callee(%i: index, %j: index, %k: index, %l: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + scf.forall (%k, %l) in (2, 3) { + %0 = affine.apply #map(%i) + %1 = affine.apply #map(%j) + %2 = affine.apply #map(%k) + %3 = affine.apply #map(%l) + func.call @callee(%0, %1, %2, %3) : (index, index, index, index) -> () + } + } + amdaie.end + } + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index bfe6dc456..f7704d4db 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -100,7 +100,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000028 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0001D21C +// CHECK: 0x0601D21C // CHECK: 0x00000000 // CHECK: 0x803F0002 // CHECK: 0x00000018 @@ -127,7 +127,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000038 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0001D214 +// CHECK: 0x0401D214 // CHECK: 0x00000000 // CHECK: 0x80FF000F // CHECK: 0x00000018 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 3a9e583bc..57a15c673 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,5 +1,19 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. There are no dma ops in this // check. 
@@ -18,7 +32,8 @@ // CHECK: %{{.*}} = amdaie.core(%[[TILE_2]], in : [], out : []) // CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) // CHECK: %{{.*}} = amdaie.core(%[[TILE_3]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { @@ -50,7 +65,8 @@ module { // CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]], in : [], out : []) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) // CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (2, 2) { @@ -92,7 +108,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : []) // CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -142,7 +159,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_single_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -196,7 +214,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_and_affine_single_loop_2x1() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> @@ -251,7 +270,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma_and_affine_single_loop() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : 
memref<32x1024xi32, 1> @@ -308,7 +328,8 @@ module { // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -367,7 +388,8 @@ module { // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_one_of_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -440,7 +462,8 @@ module { // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_3]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -491,6 +514,8 @@ module { // CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} // CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_1_3]]} @@ -499,8 +524,8 @@ module { // CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_6:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_3]]} // CHECK-DAG: %[[FROM_MEMREF_7:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_3]]} -// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_1_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: 
%[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] @@ -529,7 +554,8 @@ module { // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @nested_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c1 = arith.constant 1 : index @@ -589,23 +615,26 @@ module { // CHECK: linalg.fill ins(%[[C0]] : i32) outs(%[[ACCESS]] : memref<1x1x8x8x4x4xi32, 2 : i32>) // CHECK: amdaie.end // CHECK: memref.dealloc %[[ALLOC]] : -func.func @l1_temporary_buffer_for_matmul_elem() { - %c0_i32 = arith.constant 0 : i32 - %c2 = arith.constant 2 : index - %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> - scf.forall (%arg0, %arg1) in (1, 1) { - scf.forall (%arg2, %arg3) in (1, 1) { - %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> - %26 = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %26) - %27 = amdaie.core(%tile, in : [], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @l1_temporary_buffer_for_matmul_elem() { + %c0_i32 = arith.constant 0 : i32 + %c2 = arith.constant 2 : index + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + scf.forall (%arg0, %arg1) in (1, 1) { + scf.forall (%arg2, %arg3) in (1, 1) { + %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %26) + %27 = amdaie.core(%tile, in : [], out : []) { + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } } // ----- @@ -618,19 +647,22 @@ func.func @l1_temporary_buffer_for_matmul_elem() { // CHECK-SAME: to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> // CHECK-NOT: memref.subview // CHECK: return -func.func @not_distributable() { - %cst = arith.constant 0.000000e+00 : bf16 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<2x2x100xbf16, 2> - scf.forall (%arg0, %arg1) in (2, 2) { - scf.for %arg2 = %c0 to %c4 step %c1 { - %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> - linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) 
- } - } {mapping = [#gpu.thread, #gpu.thread]} - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @not_distributable() { + %cst = arith.constant 0.000000e+00 : bf16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %alloc = memref.alloc() : memref<2x2x100xbf16, 2> + scf.forall (%arg0, %arg1) in (2, 2) { + scf.for %arg2 = %c0 to %c4 step %c1 { + %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> + linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) + } + } {mapping = [#gpu.thread, #gpu.thread]} + return + } } @@ -652,6 +684,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%c0, %c1) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%c1, %c1) // CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%c0, %c0) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%c1, %c0) // CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_1]]} @@ -663,7 +696,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} // CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_1_0]]} // CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] // CHECK-SAME: %[[FROM_MEMREF_11]] @@ -702,7 +735,8 @@ func.func @not_distributable() { #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> #map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos() { %c2 = arith.constant 2 : index %c1024 = arith.constant 1024 : index @@ -818,7 +852,8 @@ module { // CHECK-DAG: vector.transfer_write %[[CONTRACT]], %[[VAL_2]] // CHECK-DAG-SAME: [%[[C0]], %[[C0]], %[[ARG3]], %[[ARG2]], %[[C0]], %[[C0]]] // CHECK-DAG-SAME: in_bounds = [true, true, true, true, true, true] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_vectorization() { %c192 = arith.constant 192 : index %c32 = arith.constant 32 : index @@ -918,7 +953,8 @@ module { // CHECK-DAG: func.call 
@matmul_i32_i32 // CHECK-DAG: amdaie.end // CHECK-DAG: } {elf_file = "/path/to/ukernel.o"} -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func private @matmul_i32_i32(memref, index, memref, index, memref, index) attributes {link_with = "/path/to/ukernels.o", llvm.bareptr = true} func.func @distribute_cores_and_objectfifos_ukernel() { %c64 = arith.constant 64 : index @@ -987,7 +1023,8 @@ module { // CHECK-SAME: ins(%[[ACCESS_1]] : memref<4x4xi32, 2 : i32>) outs(%[[SUBVIEW:.*]] : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) { #map = affine_map<(d0, d1) -> (d0, d1)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @mixed_alloc_subview_operands() { %c2 = arith.constant 2 : index %c0_i32 = arith.constant 0 : i32 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir index 1d3c38d7a..efbd2b931 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir @@ -90,4 +90,25 @@ func.func @non_distributing_subview(%index : index) { } +// ----- + +// Example where the subview type is a complete view of the alloc: unchanged IR. +// CHECK-LABEL: @complete_view_subview +// CHECK-NEXT: memref.alloc() : memref<4xbf16, 2> +// CHECK-NEXT: scf.forall +// CHECK-NEXT: arith.constant +// CHECK-NEXT: memref.subview +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: mapping = [#gpu.thread] +// CHECK-NEXT: return + +func.func @complete_view_subview() { + %alloc = memref.alloc() : memref<4xbf16, 2> + scf.forall (%arg0) in (4) { + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[0] [4] [1] : memref<4xbf16, 2> to memref<4xbf16, 2> + linalg.fill ins(%c0_bf16 : bf16) outs(%subview : memref<4xbf16, 2>) + } {mapping = [#gpu.thread]} + return +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir index 7f0ae6d13..f266e861d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir @@ -64,9 +64,8 @@ func.func @func3() { // CHECK-LABEL: @func4 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1> // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<8x16xi32> -// CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, 1> -// CHECK: %[[TRANSPOSE0:.*]] = memref.transpose %[[SUBVIEW0]] (d0, d1) -> (d0, d1) : memref<8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> -// CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], %[[TRANSPOSE0]][] [] []) : (memref<8x16xi32>, memref<8x16xi32, strided<[16, 1]>, 1>) +// CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> +// CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], 
%[[SUBVIEW0]][] [] []) : (memref<8x16xi32>, memref<8x16xi32, strided<[16, 1]>, 1>) func.func @func4() { %alloc = memref.alloc() : memref<1x1x8x16xi32, 1> %alloc_0 = memref.alloc() : memref<8x16xi32> @@ -79,9 +78,8 @@ func.func @func4() { // CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) = // CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG1]], %[[ARG2]]] [1, 8, 64] [1, 1, 1] : memref<32x8x64xf32> to memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>> // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x1x1x8x64xf32, 1> -// CHECK: %[[SUBVIEW1:.*]] = memref.subview %[[ALLOC1]][0, 0, 0, 0, 0] [1, 1, 1, 8, 64] [1, 1, 1, 1, 1] : memref<1x1x1x8x64xf32, 1> to memref<1x8x64xf32, 1> -// CHECK: %[[TRANSPOSE0:.*]] = memref.transpose %[[SUBVIEW1]] (d0, d1, d2) -> (d0, d1, d2) : memref<1x8x64xf32, 1> to memref<1x8x64xf32, strided<[512, 64, 1]>, 1> -// CHECK: air.dma_memcpy_nd (%[[SUBVIEW0]][] [] [], %[[TRANSPOSE0]][] [] []) : (memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>>, memref<1x8x64xf32, strided<[512, 64, 1]>, 1>) +// CHECK: %[[SUBVIEW1:.*]] = memref.subview %[[ALLOC1]][0, 0, 0, 0, 0] [1, 1, 1, 8, 64] [1, 1, 1, 1, 1] : memref<1x1x1x8x64xf32, 1> to memref<1x8x64xf32, strided<[512, 64, 1]>, 1> +// CHECK: air.dma_memcpy_nd (%[[SUBVIEW0]][] [] [], %[[SUBVIEW1]][] [] []) : (memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>>, memref<1x8x64xf32, strided<[512, 64, 1]>, 1>) func.func @func5() { %c0 = arith.constant 0 : index @@ -164,9 +162,8 @@ func.func @func6() { memref.dealloc %alloc_9 : memref<1x1x2x2x4x8xi32, 2> scf.reduce } - // CHECK: %[[SUBVIEW8:.*]] = memref.subview %{{.*}}[0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, 1> - // CHECK: %[[TRANSPOSE5:.*]] = memref.transpose %[[SUBVIEW8]] (d0, d1) -> (d0, d1) : memref<8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> - // CHECK: air.dma_memcpy_nd (%[[SUBVIEW2]][] [] [], %[[TRANSPOSE5]][] [] []) : (memref<8x16xi32, strided<[32, 1], offset: ?>>, memref<8x16xi32, strided<[16, 1]>, 1>) + // CHECK: %[[SUBVIEW8:.*]] = memref.subview %{{.*}}[0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> + // CHECK: air.dma_memcpy_nd (%[[SUBVIEW2]][] [] [], %[[SUBVIEW8]][] [] []) : (memref<8x16xi32, strided<[32, 1], offset: ?>>, memref<8x16xi32, strided<[16, 1]>, 1>) iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %subview_1 : (memref<1x1x8x16xi32, 1> memref<8x16xi32, strided<[32, 1], offset: ?>>) memref.dealloc %alloc_2 : memref<1x1x16x16xi32, 1> memref.dealloc %alloc : memref<1x1x8x16xi32, 1> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir new file mode 100644 index 000000000..cb087421e --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -0,0 +1,159 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// Test of splitting matmul lhs input objectFifo and dma operations. 
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-LABEL: func.func @split_L2_input_lhs
+// CHECK-DAG: %[[ALLOC_A0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_A1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #[[MAP1]](%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #[[MAP0]](%[[IV0]])
+// CHECK: %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[%[[IV0_0:.*]], 0] [32, 32] [128, 1]
+// CHECK: %[[DMA_L3_TO_L2_A1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_A1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[%[[IV0_32:.*]], 0] [32, 32] [128, 1]
+// CHECK: %[[DMA_L2_TO_L1_A0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
+// CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
+// CHECK: %[[DMA_L2_TO_L1_A1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
+// CHECK-SAME: %[[OBJ_L2_A1]][0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
+// CHECK: memref.dealloc %[[ALLOC_A0]] : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_A1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg1)
+      %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    } {mapping = [#gpu.block, #gpu.block]}
+    memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Test of splitting matmul rhs input objectFifo and dma operations.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-LABEL: func.func @split_L2_input_rhs
+// CHECK-DAG: %[[ALLOC_B0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_B1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_B0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_B0]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_B1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_B1]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #[[MAP1]](%[[IV1]])
+// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #[[MAP0]](%[[IV1]])
+// CHECK: %[[DMA_L3_TO_L2_B0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_B0]][0, 0, 0, 0] [1, 32, 1, 32] [2048, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[0, %[[IV1_0:.*]]] [32, 32] [128, 1]
+// CHECK: %[[DMA_L3_TO_L2_B1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_B1]][0, 0, 0, 0] [1, 32, 1, 32] [2048, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[0, %[[IV1_32:.*]]] [32, 32] [128, 1]
+// CHECK: %[[DMA_L2_TO_L1_B0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1]
+// CHECK-SAME: %[[OBJ_L2_B0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK: %[[DMA_L2_TO_L1_B1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1]
+// CHECK-SAME: %[[OBJ_L2_B1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK: memref.dealloc %[[ALLOC_B0]] : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_B1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg2)
+      %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    } {mapping = [#gpu.block, #gpu.block]}
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Test of splitting matmul output objectFifo and dma operations.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-LABEL: func.func @split_L2_output
+// CHECK-DAG: %[[ALLOC_C0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_C1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_C0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_C0]], {} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_C1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_C1]], {} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #[[MAP0]](%[[IV1]])
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #[[MAP0]](%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #[[MAP1]](%[[IV0]])
+// CHECK: %[[DMA_L1_TO_L2_C0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C4:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L2_TO_L3_C0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[%[[IV0_0:.*]], %[[IV1_0:.*]]] [32, 64] [128, 1]
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1]
+// CHECK: %[[DMA_L2_TO_L3_C1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[%[[IV0_32:.*]], %[[IV1_0:.*]]] [32, 64] [128, 1]
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1]
+// CHECK: memref.dealloc %[[ALLOC_C0]] : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_C1]] : memref<1x2x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_output(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg2)
+      %3 = affine.apply #map(%arg1)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1,
8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } +} diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc index c3dc89afe..3a654f27a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc @@ -486,7 +486,8 @@ std::optional> Router::findPaths( auto curr = endPoint; // trace backwards until a vertex already processed is reached while (!processed.count(curr)) { - auto &sb = impl->graph[std::make_pair(preds[curr].tileLoc, curr.tileLoc)]; + auto &sb = + impl->graph[std::make_pair(preds[curr].tileLoc, curr.tileLoc)]; size_t i = std::distance(sb.srcPorts.begin(), std::find(sb.srcPorts.begin(), sb.srcPorts.end(), @@ -870,7 +871,8 @@ FailureOr> emitPacketRoutingConfiguration( MasterSetsT mastersets; for (const auto &[physPort, ports] : masterAMSels) { for (Port port : ports) { - mastersets[PhysPort{physPort.first, port}].push_back(physPort.second); + mastersets[PhysPort{physPort.first, port, PhysPort::Direction::DST}] + .push_back(physPort.second); } } @@ -883,6 +885,14 @@ FailureOr> emitPacketRoutingConfiguration( /// ================== stringification utils ============================= /// ====================================================================== +std::string to_string(const PhysPort::Direction &direction) { + switch (direction) { + STRINGIFY_ENUM_CASE(PhysPort::Direction::SRC) + STRINGIFY_ENUM_CASE(PhysPort::Direction::DST) + } + llvm::report_fatal_error("Unhandled PhysPortDirection case"); +} + std::string to_string(const SwitchSetting &setting) { return "SwitchSetting(" + llvm::join( @@ -913,7 +923,7 @@ std::string to_string(const SwitchSettings &settings) { STRINGIFY_2TUPLE_STRUCT(Port, bundle, channel) STRINGIFY_2TUPLE_STRUCT(Connect, src, dst) STRINGIFY_2TUPLE_STRUCT(PathEndPoint, tileLoc, port) -STRINGIFY_2TUPLE_STRUCT(PhysPort, tileLoc, port) +STRINGIFY_3TUPLE_STRUCT(PhysPort, tileLoc, port, direction) STRINGIFY_2TUPLE_STRUCT(PhysPortAndID, physPort, id) BOTH_OSTREAM_OPS_FORALL_ROUTER_TYPES(OSTREAM_OP_DEFN, BOTH_OSTREAM_OP) diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h index b30a1a147..449eb6ca2 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h @@ -23,7 +23,8 @@ struct Port { // mlir-air legacy Port() : bundle(), channel() {} - Port(StrmSwPortType b, int c) : bundle(b), channel(c) {} + Port(StrmSwPortType b, int c) + : bundle(b), channel(c) {} typedef std::tuple TupleType; Port(TupleType t) : Port(std::get<0>(t), std::get<1>(t)) {} operator TupleType() const { return {bundle, channel}; } @@ -109,12 +110,16 @@ bool existsPathToDest(const SwitchSettings &settings, TileLoc currTile, int finalDestChannel); struct PhysPort { + enum Direction { SRC, DST }; TileLoc tileLoc; Port port; - 
PhysPort(TileLoc t, Port p) : tileLoc(t), port(p) {} - using TupleType = std::tuple; - PhysPort(TupleType t) : PhysPort(std::get<0>(t), std::get<1>(t)) {} - operator TupleType() const { return {tileLoc, port}; } + Direction direction; + PhysPort(TileLoc t, Port p, Direction direction) + : tileLoc(t), port(p), direction(direction) {} + using TupleType = std::tuple; + PhysPort(TupleType t) + : PhysPort(std::get<0>(t), std::get<1>(t), std::get<2>(t)) {} + operator TupleType() const { return {tileLoc, port, direction}; } TUPLE_LIKE_STRUCT_RELATIONAL_OPS(PhysPort) }; @@ -166,7 +171,9 @@ TO_STRINGS(TO_STRING_DECL) _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::Port) \ _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::SwitchSetting) \ _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPort) \ - _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPortAndID) + _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPortAndID) \ + _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPort::Direction) + BOTH_OSTREAM_OPS_FORALL_ROUTER_TYPES(OSTREAM_OP_DECL, BOTH_OSTREAM_OP) diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index 0af56fd20..a29fe0898 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -318,6 +318,22 @@ uint32_t AMDAIEDeviceModel::getMemTileSize(uint8_t col, uint8_t row) const { return devInst.DevProp.DevMod[static_cast(tileType)].MemMod->Size; } +SmallVector AMDAIEDeviceModel::getMemSpaceRows( + uint8_t memSpace) const { + SmallVector res; + if (memSpace == 0) { + res.resize(deviceConfig.shimTileNumRows); + std::iota(res.begin(), res.end(), configPtr.ShimRowNum); + } else if (memSpace == 1) { + res.resize(configPtr.MemTileNumRows); + std::iota(res.begin(), res.end(), configPtr.MemTileRowStart); + } else if (memSpace == 2) { + res.resize(configPtr.AieTileNumRows); + std::iota(res.begin(), res.end(), configPtr.AieTileRowStart); + } + return res; +} + bool AMDAIEDeviceModel::hasLegalMemAffinity(uint8_t coreCol, uint8_t coreRow, uint8_t memCol, uint8_t memRow) const { @@ -483,6 +499,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE1_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE1_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE1_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE1_SS_MSEL_MAX; @@ -507,6 +524,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2302: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -530,6 +548,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2802: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -557,6 +576,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + 
deviceConfig.shimTileNumRows = XAIE2IPU_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE2IPU_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE2IPU_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE2IPU_SS_MSEL_MAX; @@ -603,6 +623,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::npu4: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE_STRIXB0_MEM_TILE_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE_STRIXB0_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE_STRIXB0_SS_MSEL_MAX; diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index a45f798ad..144e1dc62 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -201,7 +201,7 @@ inline ::XAie_TxnOpcode txnToTxn(XAie_TxnOpcode t) { } // mlir-air legacy -enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 }; +enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2, AIE2p = 3 }; /* * This struct is meant to be a thin wrapper around aie-rt, which provides @@ -228,6 +228,12 @@ struct AMDAIEDeviceModel { /// retrieved in another way before adding new fields to this struct. struct AMDAIEDeviceConfig { + /////////////////////////////////////// + // AIE Array configuration constants // + /////////////////////////////////////// + /// The number of shim tile rows. Not found in aie-rt data structures, but + /// provided as `XAIE_SHIM_NUM_ROWS`. + uint8_t shimTileNumRows{1}; /// Set default minimum stride bitwidth/addressing granularity to 32 bits as /// this is the value for all current architecture versions. 
uint8_t minStrideBitWidth{32}; @@ -334,6 +340,8 @@ struct AMDAIEDeviceModel { uint32_t getMemTileSize(uint8_t col, uint8_t row) const; uint32_t getCoreTileLocalMemorySize() const; + SmallVector getMemSpaceRows(uint8_t memSpace) const; + uint32_t getNumBDs(uint8_t col, uint8_t row) const; uint32_t getNumSourceSwitchBoxConnections(uint8_t col, uint8_t row, @@ -356,7 +364,9 @@ struct AMDAIEDeviceModel { return deviceConfig.vectorLoadStoreAlignmentBits; } - uint32_t getMaxVectorSizeBits() const { return deviceConfig.maxVectorSizeBits; } + uint32_t getMaxVectorSizeBits() const { + return deviceConfig.maxVectorSizeBits; + } uint32_t getShiftOperandBits() const { return deviceConfig.shiftOperandBits; } diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc index 655d33bc3..635c4eaa8 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc @@ -16,8 +16,9 @@ std::pair amsel(uint8_t a, uint8_t msel) { TEST(AMSelGeneratorTest, TileNotInitialized) { AMSelGenerator generator; TileLoc tileLoc(0, 1); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = { + {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(failed(generator.addConnection(tileLoc, src1, {dst1}))); } @@ -25,8 +26,8 @@ TEST(AMSelGeneratorTest, NoArbitersNoMSels) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 0, 0); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -35,8 +36,8 @@ TEST(AMSelGeneratorTest, NoArbiters) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 0, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -45,8 +46,8 @@ TEST(AMSelGeneratorTest, NoMSels) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 0); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -55,19 +56,19 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; 
- PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); for (int i = 1; i < 6; i++) { - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, i}}, i}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, i}}, i}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, i}, PhysPort::Direction::DST}, i}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); } EXPECT_TRUE(succeeded(generator.solve())); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, i}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i}; EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(i, 0)); } } @@ -79,13 +80,13 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDstSamePorts) { TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, i}; - PhysPortAndID dst = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i}; + PhysPortAndID dst = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, i}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src, {dst}))); } EXPECT_TRUE(succeeded(generator.solve())); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i}; EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(0, 0)); } } @@ -94,15 +95,15 @@ TEST(AMSelGeneratorTest, SingleSrcMultiDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::EAST, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::EAST, 1}}, 1}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::DST}, 1}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(1, 0)); @@ -113,19 +114,19 @@ TEST(AMSelGeneratorTest, 
MultiSrcSingleDst) { TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); // Reuse msels for multiple sources. - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst1}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 0)); - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 1}; - PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::EAST, 0}}, 1}; - PhysPortAndID src5 = {{{0, 1}, {StrmSwPortType::EAST, 1}}, 1}; - PhysPortAndID src6 = {{{0, 1}, {StrmSwPortType::EAST, 2}}, 1}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src5 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src6 = {{{0, 1}, {StrmSwPortType::EAST, 2}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src5, {dst2}))); @@ -143,23 +144,23 @@ TEST(AMSelGeneratorTest, MultiSrcMultiDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 1}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 1)); - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 2}; - PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 3}; - PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::WEST, 
0}}, 2}; - PhysPortAndID dst6 = {{{0, 1}, {StrmSwPortType::WEST, 1}}, 2}; - PhysPortAndID dst7 = {{{0, 1}, {StrmSwPortType::WEST, 0}}, 3}; - PhysPortAndID dst8 = {{{0, 1}, {StrmSwPortType::WEST, 2}}, 3}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 3}; + PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 2}; + PhysPortAndID dst6 = {{{0, 1}, {StrmSwPortType::WEST, 1}, PhysPort::Direction::DST}, 2}; + PhysPortAndID dst7 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 3}; + PhysPortAndID dst8 = {{{0, 1}, {StrmSwPortType::WEST, 2}, PhysPort::Direction::DST}, 3}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5, dst6}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst7, dst8}))); EXPECT_TRUE(succeeded(generator.solve())); @@ -173,14 +174,14 @@ TEST(AMSelGeneratorTest, ReuseArbiters) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 1, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}}, 2}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 3}}, 1}; - PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::NORTH, 4}}, 2}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 3}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::NORTH, 4}, PhysPort::Direction::DST}, 2}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5}))); @@ -194,18 +195,33 @@ TEST(AMSelGeneratorTest, ReuseArbitersFailure) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 1, 2); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}}, 2}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 2}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = 
{{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 2}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst3}))); EXPECT_TRUE(failed(generator.solve())); } +TEST(AMSelGeneratorTest, DifferentDirections) { + AMSelGenerator generator; + TileLoc tileLoc(0, 1); + generator.initTileIfNotExists(tileLoc, 6, 4); + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::DST}, 0}; + EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); + EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); + EXPECT_TRUE(succeeded(generator.solve())); + EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); + EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(1, 0)); +} + } // namespace mlir::iree_compiler::AMDAIE int main(int argc, char **argv) { diff --git a/third_party/mlir-air b/third_party/mlir-air index 3bab1025d..f3884b6d0 160000 --- a/third_party/mlir-air +++ b/third_party/mlir-air @@ -1 +1 @@ -Subproject commit 3bab1025d02ffd2b14c0e887bb3749b4836936b2 +Subproject commit f3884b6d0e1910424d47f2310bba4666dd6c8105