diff --git a/build_tools/ci/cpu_comparison/performance_summarizer.py b/build_tools/ci/cpu_comparison/performance_summarizer.py index 146be4aa7..401b06401 100644 --- a/build_tools/ci/cpu_comparison/performance_summarizer.py +++ b/build_tools/ci/cpu_comparison/performance_summarizer.py @@ -6,15 +6,22 @@ import sys if len(sys.argv) != 2: - print("Usage: python3 performance_summarizer.py ") + print( + "Usage: python3 performance_summarizer.py . This will strip out the performance numbers from the log file and print a summary." + ) sys.exit(1) path = sys.argv[1] with open(path, "r") as f: lines = f.readlines() print("============================") + first_print = True for line in lines: if "Run #1" in line: - print(line.split()[-1]) + if not first_print: + print("\n" + line.split()[-1]) + else: + print(line.split()[-1]) + first_print = False if "IREE_AMDAIE" in line: - print(line) + print(line.strip()) print("============================") diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 755464245..79a73f101 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -25,9 +25,10 @@ ) -def run_conv_test(config, filename, n_repeats): +def run_conv_test(config, aie_compilation_flags, filename, n_repeats): aie_vs_llvm_cpu( config, + aie_compilation_flags, filename, tile_pipeline="conv-decompose", lower_to_aie_pipeline="objectFifo", @@ -37,35 +38,58 @@ def run_conv_test(config, filename, n_repeats): return True -class BaseTemplate(ABC): +class BaseTest(ABC): """ - Base class to be inherited by any new dispatch/compute op template. - The derived instances would therefore be created specifying the intended target - device(s) they're to be run; and would accordingly be `run` if the intended - target device(s) contains the `target_device` supplied via command-line. + Base class to be inherited by all tests. The derived instances will be + created specifying the intended target device(s) they're to be run on; and + will accordingly be `run` if the intended target device(s) contains + the `target_device` found in `config`. The default set of targets to run on + is the singleton set `["npu1_4col"]`. + + An instance of this class has a member `aie_compilation_flags` + which are additional flags to be passed to the AIE backend compiler. + + Compilation flags can therefore be injected into tests in 2 ways: + 1) via the constructor of this base class + 2) via the `add_aie_compilation_flags` method """ - def __init__(self, run_on_target, additional_aie_compilation_flags=None): - self.run_on_target = run_on_target - self.additional_aie_compilation_flags = additional_aie_compilation_flags \ - if additional_aie_compilation_flags is not None else [] + def __init__(self, run_on_target=["npu1_4col"], aie_compilation_flags=None): + self.run_on_target = [] if run_on_target is None else run_on_target + self.aie_compilation_flags = ( + [] if aie_compilation_flags is None else aie_compilation_flags + ) + assert isinstance(self.aie_compilation_flags, list) + assert all(isinstance(flag, str) for flag in self.aie_compilation_flags) + + # NB: derived classes should add labels to this list in their + # constructor, never overwrite it. 
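+        # For example, a hypothetical derived test might inject flags via the
+        # constructor and extend (not overwrite) the labels, along these lines
+        # (illustrative sketch only):
+        #
+        #   class MyMatmulTest(BaseTest):
+        #       def __init__(self):
+        #           super().__init__(
+        #               run_on_target=["npu1_4col"],
+        #               aie_compilation_flags=["--iree-amdaie-enable-ukernels=all"],
+        #           )
+        #           self.labels += ["MyMatmulTest"]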
+ self.labels = ["All"] + + def add_aie_compilation_flags(self, flags): + if flags: + if isinstance(flags, str): + flags = flags.split() + assert isinstance(flags, list) + assert all(isinstance(flag, str) for flag in flags) + + self.aie_compilation_flags += flags + # unique-ify the list + self.aie_compilation_flags = list(set(self.aie_compilation_flags)) def run(self, config): - for flag in self.additional_aie_compilation_flags: - config.add_additional_aie_compilation_flag(flag) if config.target_device in self.run_on_target: return self._execute(config) - # Return False to indicate that the test did not run. return False @abstractmethod def _execute(self, config): - pass + raise RuntimeError("Derived class must implement this method") -class ConvolutionFromTemplate(BaseTemplate): - def __init__(self, params, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class ConvolutionFromTemplate(BaseTest): + def __init__(self, params): + super().__init__() self.generator = ConvolutionMlirGenerator(**params) params = self.generator.params conv_type = params["conv_type"] @@ -75,7 +99,7 @@ def __init__(self, params, run_on_target=["npu1_4col"]): out_type = params["output_element_type"] # TODO(newling) Use all parameters in name, to avoid name collision. self.name = f"{conv_type}_{N}_{IW}_{in_type}_{out_type}" - self.labels = ["Convolution"] + self.labels += ["Convolution"] def _execute(self, config): # Generate MLIR file: @@ -83,73 +107,91 @@ def _execute(self, config): filename = output_dir / f"{self.name}.mlir" self.generator.write_to_file(filename) # Perform numerical comparison between AIE and CPU: - return run_conv_test(config, filename, n_repeats=2) + return run_conv_test(config, self.aie_compilation_flags, filename, n_repeats=2) -class ConvolutionNHWCQ(BaseTemplate): - def __init__(self, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class ConvolutionNHWCQ(BaseTest): + def __init__(self): + super().__init__() self.name = "convolution_nhwc_q" - self.labels = ["Convolution", "ConvolutionNHWCQ"] + self.labels += ["Convolution", "ConvolutionNHWCQ"] def _execute(self, config): files_dir = config.file_dir / "test_files" filename = files_dir / "conv2d_nhwc_q.mlir" - return run_conv_test(config, filename, n_repeats=1) + return run_conv_test(config, self.aie_compilation_flags, filename, n_repeats=1) -class MultipleDispatches(BaseTemplate): - def __init__(self, name, run_on_target=["npu1_4col"]): - super().__init__(run_on_target) +class MultipleDispatches(BaseTest): + def __init__(self, name): + super().__init__() self.name = name - self.labels = ["Matmul", "MultipleDispatches"] + self.labels += ["Matmul", "MultipleDispatches"] def _execute(self, config): test_files_dir = config.file_dir / "test_files" self.filename = test_files_dir / f"{self.name}.mlir" + # TODO(newling) did Maks ever document why this is here, if so add an + # explainer. if config.xdna_datetime and config.xdna_datetime < 20240801: - aie_vs_llvm_cpu(config, self.filename, function_name="three_$mm$") + aie_vs_llvm_cpu( + config, + self.aie_compilation_flags, + self.filename, + function_name="three_$mm$", + ) return True else: # Return False to indicate that the test did not run. 
return False -class BaseMatmul(BaseTemplate): +class BaseMatmul(BaseTest): def __init__( self, + run_on_target, + aie_compilation_flags, M, N, - K, + K, input_type, acc_type, - run_on_target=["npu1_4col"], - additional_compilation_flags=None + use_ukernel=False, + lower_to_aie_pipeline="objectFifo", + tile_pipeline="pack-peel", + n_repeats=1, ): - super().__init__(run_on_target, additional_compilation_flags) - self.labels = [] + super().__init__(run_on_target, aie_compilation_flags) self.M = M self.N = N self.K = K self.input_type = input_type self.acc_type = acc_type + self.tile_pipeline = tile_pipeline + self.lower_to_aie_pipeline = lower_to_aie_pipeline + self.use_ukernel = use_ukernel + self.n_repeats = n_repeats self.labels.append("Matmul") + if use_ukernel: + self.labels.append("UKernel") + def vs_cpu(self, config, filename): + if self.use_ukernel and not config.vitis_dir: + return False -class MatmulFullBias(BaseMatmul): - """ - A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN - """ + aie_vs_llvm_cpu( + config=config, + aie_compilation_flags=self.aie_compilation_flags, + test_file=filename, + use_ukernel=self.use_ukernel, + tile_pipeline=self.tile_pipeline, + lower_to_aie_pipeline=self.lower_to_aie_pipeline, + n_repeats=self.n_repeats, + ) - def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): - super().__init__(M, N, K, input_type, acc_type, run_on_target) - self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}" - self.labels.append("MatmulFullBias") + return True - def _execute(self, config): - filename = config.output_dir / f"{self.name}.mlir" - matmul_template_dir = config.file_dir / "matmul_template" - template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir" + def generate(self, filename, template_name): generate_matmul_test( filename, template_name, @@ -159,14 +201,33 @@ def _execute(self, config): self.input_type, self.acc_type, ) - aie_vs_llvm_cpu( - config, - filename, - tile_pipeline="pack-peel", - # TODO(someone) This should work for "objectFifo". 
+ + +class MatmulFullBias(BaseMatmul): + """ + A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN + """ + + def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, lower_to_aie_pipeline="air", ) + self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}" + self.labels.append("MatmulFullBias") + def _execute(self, config): + filename = config.output_dir / f"{self.name}.mlir" + matmul_template_dir = config.file_dir / "matmul_template" + template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir" + self.generate(filename, template_name) + self.vs_cpu(config, filename) return True @@ -177,39 +238,47 @@ class VanillaMatmul(BaseMatmul): def __init__( self, - name, M, N, K, input_type, acc_type, - use_ukernel, + name_suffix="", + use_ukernel=False, run_on_target=["npu1_4col"], - additional_labels=[], - additional_aie_compilation_flags=None, + additional_labels=None, + aie_compilation_flags=None, + n_repeats=1, ): - super().__init__(M, N, K, input_type, acc_type, run_on_target, - additional_aie_compilation_flags) - self.name = f"vanilla_matmul_{name}_{M}_{N}_{K}_{input_type}_{acc_type}" + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=aie_compilation_flags, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + use_ukernel=use_ukernel, + n_repeats=n_repeats, + ) + + self.name = f"vanilla_matmul_{M}_{N}_{K}_{input_type}_{acc_type}" + if name_suffix: + self.name += f"_{name_suffix}" + if use_ukernel: + self.name += "_ukernel" self.labels.append("VanillaMatmul") - self.labels += additional_labels + if additional_labels: + self.labels += additional_labels self.use_ukernel = use_ukernel def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_MxK_KxN.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, - self.N, - self.K, - self.input_type, - self.acc_type, - ) - - aie_vs_llvm_cpu(config, self.filename, use_ukernel=self.use_ukernel) + self.generate(self.filename, template_name) + self.vs_cpu(config, self.filename) return True @@ -220,43 +289,38 @@ class MatmulThinBias(BaseMatmul): """ def __init__( - self, M, N, K, input_type, acc_type, use_ukernel, run_on_target=["npu1_4col"] + self, + M, + N, + K, + input_type, + acc_type, + use_ukernel=False, + run_on_target=["npu1_4col"], ): - super().__init__(M, N, K, input_type, acc_type, run_on_target) - tail = "" if use_ukernel else "ukernel" - self.name = f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}_{tail}" - self.labels.append("MatmulThinBias") + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + lower_to_aie_pipeline="air", + use_ukernel=use_ukernel, + ) + + self.name = f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}" if use_ukernel: - self.labels.append("UKernel") - self.use_ukernel = use_ukernel + self.name += "_ukernel" + self.labels.append("MatmulThinBias") def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, 
- self.K, - self.N, - self.input_type, - self.acc_type, - ) - - if self.use_ukernel and not config.vitis_dir: - return False - - else: - aie_vs_llvm_cpu( - config, - self.filename, - tile_pipeline="pack-peel", - # TODO(someone) This should work for "objectFifo". - lower_to_aie_pipeline="air", - use_ukernel=self.use_ukernel, - ) - return True + self.generate(self.filename, template_name) + return self.vs_cpu(config, self.filename) class BatchMatmul(BaseMatmul): @@ -265,7 +329,17 @@ class BatchMatmul(BaseMatmul): """ def __init__(self, B, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]): - super().__init__(M, N, K, input_type, acc_type, run_on_target) + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=N, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + n_repeats=1, + ) self.name = f"batch_matmul_{B}_{M}_{N}_{K}_{input_type}_{acc_type}" self.labels.append("BatchMatmul") @@ -285,12 +359,7 @@ def _execute(self, config): lhs_rhs_type=self.input_type, acc_type=self.acc_type, ) - aie_vs_llvm_cpu( - config, - self.filename, - ) - - return True + return self.vs_cpu(config, self.filename) class MatmulTruncf(BaseMatmul): @@ -309,56 +378,65 @@ def __init__( expected_out, run_on_target=["npu1_4col"], ): - super().__init__(M, M, K, input_type, acc_type, run_on_target) - self.name = f"matmul_truncf_{M}_{K}_{input_type}_{acc_type}" - self.labels.append("MatmulTruncf") - self.lhs = lhs - self.rhs = rhs - self.expected_out = expected_out + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=None, + M=M, + N=M, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline="pack-peel", + n_repeats=1, + ) # Assertions on shapes: Check that lhs is MxK, rhs is KxM, and expected_out is MxM assert lhs.shape == (M, K) assert rhs.shape == (K, M) assert expected_out.shape == (M, M) + self.name = f"matmul_truncf_{M}_{K}_{input_type}_{acc_type}" + self.labels.append("MatmulTruncf") + self.lhs = lhs + self.rhs = rhs + self.expected_out = expected_out + def _execute(self, config): self.filename = config.output_dir / f"{self.name}.mlir" matmul_template_dir = config.file_dir / "matmul_template" template_name = matmul_template_dir / "matmul_truncf_MxK_KxN.mlir" - generate_matmul_test( - self.filename, - template_name, - self.M, - self.N, - self.K, - self.input_type, - self.acc_type, - ) + self.generate(self.filename, template_name) input_args = generate_inputs( self.filename, config.output_dir, 1, {1: self.lhs, 2: self.rhs} ) """ currently without enabling loop coalescing and unit dimension collapsing - we run out of program memory, this is under investigation. + we run out of program memory, this is under investigation. We also + enable function outlining. 
""" + self.add_aie_compilation_flags( + [ + "--iree-amdaie-enable-coalescing-loops", + "--iree-amdaie-enable-collapsing-unit-dims", + "--iree-amdaie-enable-function-outlining", + ] + ) aie_vs_baseline( - config, - self.filename, - input_args, - self.expected_out, - use_ukernel=False, - tile_pipeline="pack-peel", - lower_to_aie_pipeline="objectFifo", + config=config, + aie_compilation_flags=self.aie_compilation_flags, + test_file=self.filename, + input_args=input_args, + baseline_value=self.expected_out, + use_ukernel=self.use_ukernel, + tile_pipeline=self.tile_pipeline, function_name=None, seed=1, rtol=0, atol=0, - n_repeats=1, + lower_to_aie_pipeline=self.lower_to_aie_pipeline, + n_repeats=self.n_repeats, output_type=get_output_type(self.filename), - coalesce_loops=True, - collapse_unit_dims=True, - function_outline=True, ) return True @@ -439,6 +517,7 @@ def shell_out(cmd: list, workdir=None, verbose: int = 0, raise_on_error=True, en def generate_aie_vmfb( config, + aie_compilation_flags, name, tile_pipeline, lower_to_aie_pipeline, @@ -446,18 +525,15 @@ def generate_aie_vmfb( test_file, input_args, function_name, - coalesce_loops=False, - collapse_unit_dims=False, - function_outline=False, ): """ Compile a test file for IREE's AIE backend, returning the path to the compiled module. """ - additional_flags = config.additional_aie_compilation_flags.split() + additional_flags = aie_compilation_flags - compilation_flags = [ + aie_compilation_flags = [ config.iree_compile_exe, test_file, "--iree-hal-target-backends=amd-aie", @@ -479,31 +555,22 @@ def generate_aie_vmfb( ] if config.verbose: - compilation_flags += ["--iree-amd-aie-show-invoked-commands"] + aie_compilation_flags += ["--iree-amd-aie-show-invoked-commands"] if use_ukernel: - compilation_flags += ["--iree-amdaie-enable-ukernels=all"] - - if coalesce_loops: - compilation_flags += ["--iree-amdaie-enable-coalescing-loops"] - - if collapse_unit_dims: - compilation_flags += ["--iree-amdaie-enable-collapsing-unit-dims"] - - if function_outline: - compilation_flags += ["--iree-amdaie-enable-function-outlining"] + aie_compilation_flags += ["--iree-amdaie-enable-ukernels=all"] for additional_flag in additional_flags: - if additional_flag not in compilation_flags: - compilation_flags += [additional_flag] + if additional_flag not in aie_compilation_flags: + aie_compilation_flags += [additional_flag] - compilation_flags += [ + aie_compilation_flags += [ "-o", config.output_dir / f"{name}_aie.vmfb", ] start = time.monotonic_ns() - shell_out(compilation_flags, config.output_dir, config.verbose) + shell_out(aie_compilation_flags, config.output_dir, config.verbose) compile_time = time.monotonic_ns() - start if config.verbose: print(f"Time spent in compilation: {compile_time // 1e6} [ms]") @@ -562,7 +629,7 @@ def generate_llvm_cpu_output( """ cpu_vmfb = config.output_dir / f"{name}_cpu.vmfb" - compilation_flags = [ + aie_compilation_flags = [ config.iree_compile_exe, test_file, "--iree-hal-target-backends=llvm-cpu", @@ -570,7 +637,7 @@ def generate_llvm_cpu_output( "-o", f"{cpu_vmfb}", ] - shell_out(compilation_flags, workdir=config.output_dir, verbose=config.verbose) + shell_out(aie_compilation_flags, workdir=config.output_dir, verbose=config.verbose) cpu_bin = config.output_dir / f"{name}_cpu.bin" run_args = [ @@ -587,8 +654,7 @@ def generate_llvm_cpu_output( class TestConfig: """ - Global state used for all tests. Stores paths to executables used, and - records test failures. + Global state used for all tests. Stores paths to executables used. 
""" def __init__( @@ -602,10 +668,8 @@ def __init__( iree_compile_exe, iree_run_exe, verbose, - return_on_fail, reset_npu_between_runs, do_not_run_aie, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -620,13 +684,11 @@ def __init__( self.file_dir = file_dir self.iree_compile_exe = iree_compile_exe self.iree_run_exe = iree_run_exe - self.return_on_fail = return_on_fail self.verbose = verbose self.xdna_datetime = None self.xdna_hash = None self.reset_npu_between_runs = reset_npu_between_runs self.do_not_run_aie = do_not_run_aie - self.additional_aie_compilation_flags = additional_aie_compilation_flags self.device_hal = device_hal self.xrt_lite_n_core_rows = xrt_lite_n_core_rows self.xrt_lite_n_core_cols = xrt_lite_n_core_cols @@ -647,9 +709,6 @@ def __init__( f"The file {self.reset_npu_script} does not exist, and reset_npu_script=True" ) - # Populated at runtime - self.failures = [] - if not isinstance(self.verbose, bool) and not isinstance(self.verbose, int): raise ValueError( f"verbose must be a boolean or integer, not {type(verbose)}" @@ -727,9 +786,6 @@ def __init__( if peano_commit_hash: self.peano_commit_hash = peano_commit_hash[0] - def add_additional_aie_compilation_flag(self, flag): - self.additional_aie_compilation_flags += flag - def __str__(self): return dedent( f""" @@ -746,7 +802,6 @@ def __str__(self): peano_commit_hash: {self.peano_commit_hash} peano_dir: {self.peano_dir} reset_npu_script: {self.reset_npu_script} - return_on_fail: {self.return_on_fail} use_chess: {self.use_chess} verbose: {self.verbose} vitis_dir: {self.vitis_dir} @@ -798,6 +853,7 @@ def name_from_mlir_filename(mlir_filename): def aie_vs_baseline( config, + aie_compilation_flags, test_file, input_args, baseline_value, @@ -810,13 +866,10 @@ def aie_vs_baseline( atol, n_repeats, output_type, - coalesce_loops=False, collapse_unit_dims=False, function_outline=False, ): """ - If the outputs differ, add the test file to a list of failures. - Arguments to the function are: config: TestConfig containing any state which is common to all tests @@ -839,8 +892,6 @@ def aie_vs_baseline( n_repeats: The number of times to run the test. This is useful for tests which may pass only sometimes due to driver issues, etc. 
- coalesce_loops: - Whether to enable coalescing of loops when compiling for AIE backend collapse_unit_dims: Whether to enable collapsing of unit dimensions when compiling for AIE backend function_outline: @@ -851,6 +902,7 @@ def aie_vs_baseline( aie_vmfb = generate_aie_vmfb( config, + aie_compilation_flags, name, tile_pipeline, lower_to_aie_pipeline, @@ -858,9 +910,6 @@ def aie_vs_baseline( test_file, input_args, function_name, - coalesce_loops, - collapse_unit_dims, - function_outline, ) if config.do_not_run_aie: @@ -884,13 +933,12 @@ def aie_vs_baseline( summary_string = compare(baseline_value, aie_output, rtol, atol) if summary_string: print(summary_string) - config.failures.append(test_file) - if config.return_on_fail: - raise RuntimeError("Test failed, exiting.") + raise RuntimeError("Test failed, exiting.") def aie_vs_llvm_cpu( config, + aie_compilation_flags, test_file, use_ukernel=False, tile_pipeline="pack-peel", @@ -927,6 +975,7 @@ def aie_vs_llvm_cpu( aie_vs_baseline( config, + aie_compilation_flags, test_file, input_args, cpu_output, @@ -944,6 +993,10 @@ def aie_vs_llvm_cpu( class Tests: + def add_aie_compilation_flags(self, flags): + for test in self.tests: + test.add_aie_compilation_flags(flags) + def register(self, test): self.tests.append(test) if test.name in self.existing_names: @@ -1004,58 +1057,71 @@ def __init__(self): self.register(BatchMatmul(2, 64, 64, 64, input_type, acc_type)) # MatmulThinBias test(s): - self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", True)) - self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", False)) + self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", use_ukernel=True)) + self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32")) # VanillaMatmul test(s): self.register( VanillaMatmul( - "scalar_i32", 32, 32, 32, "i32", "i32", - use_ukernel=False, run_on_target=["npu1_4col", "npu4"], ) ) self.register( VanillaMatmul( - "infinite_loop", 32, 32, 32, "i32", "i32", - use_ukernel=False, + name_suffix="infinite_loop", run_on_target=["npu1_4col", "npu4"], - additional_aie_compilation_flags=["--iree-amdaie-enable-infinite-loop-around-core-block=true"] + aie_compilation_flags=[ + "--iree-amdaie-enable-infinite-loop-around-core-block=true" + ], ) ) - self.register(VanillaMatmul("bfloat", 32, 32, 64, "bf16", "f32", use_ukernel=False)) - + self.register(VanillaMatmul(32, 32, 64, "bf16", "f32")) # TODO: Failure is expected for the 128x128 case we don't yet understand why. 
self.register( VanillaMatmul( - "bfloat_ukernel", 64, 64, 64, "bf16", "f32", use_ukernel=True, run_on_target=["npu4"] - ) - ) - - self.register( - VanillaMatmul( - "bfloat_perf", - 512, - 512, - 4096, + 64, + 64, + 64, "bf16", "f32", - use_ukernel=False, - additional_labels=["Performance"], + use_ukernel=True, + run_on_target=["npu4"], ) ) + # Some bf16 Performance tests: + for M, N, K, use_ukernel in [ + (512, 512, 4096, False), + (512, 512, 4096, True), + (512, 4096, 512, False), + (512, 4096, 512, True), + (4096, 512, 512, False), + (4096, 512, 512, True), + ]: + self.register( + VanillaMatmul( + M, + N, + K, + "bf16", + "f32", + additional_labels=["Performance"], + use_ukernel=use_ukernel, + n_repeats=2, + ) + ) + # MultipleDispatches tests: for name in ["two_matmul_switching", "matmul_f32_8_8_4", "matmul_f32_8_4_8"]: self.register(MultipleDispatches(name)) @@ -1102,12 +1168,10 @@ def all_tests( peano_dir, xrt_dir, vitis_dir, - return_on_fail, verbose, reset_npu_between_runs, do_not_run_aie, test_set, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -1138,7 +1202,7 @@ def all_tests( raise RuntimeError(f"'{iree_install_dir}' is not a directory.") iree_compile_exe = find_executable(iree_install_dir, "iree-compile") iree_run_exe = find_executable(iree_install_dir, "iree-run-module") - file_dir = Path(__file__).parent + file_dir = Path(os.path.dirname(os.path.abspath(__file__))) config = TestConfig( output_dir, @@ -1150,10 +1214,8 @@ def all_tests( iree_compile_exe, iree_run_exe, verbose, - return_on_fail, reset_npu_between_runs, do_not_run_aie, - additional_aie_compilation_flags, device_hal, xrt_lite_n_core_rows, xrt_lite_n_core_cols, @@ -1180,8 +1242,8 @@ def all_tests( for test in tests.tests: # Determine if the test is a match for the test_set provided by caller - match = "All" in test_set - match = match or test.name in test_set + # match = "All" in test_set + match = test.name in test_set for label in test.labels: match = match or label in test_set @@ -1194,20 +1256,6 @@ def all_tests( else: not_match.append(test.name) - if config.failures: - # Convert the list of failed tests into a map: test name to the - # number of failures (config.failures list may contain duplicates) - failures_map = {} - for test in config.failures: - if test in failures_map: - failures_map[test] += 1 - else: - failures_map[test] = 1 - error_string = "The following tests failed:" - for test, count in failures_map.items(): - error_string += f"\n {test} ({count} times)." - raise RuntimeError(error_string) - if verbose: print(f"Tests that ran: {match_run}") print(f"Tests that matched but did not run: {match_not_run}") @@ -1254,20 +1302,6 @@ def all_tests( "--target_device", type=str, required=True, help=target_device_help_string ) - # TODO(newling) make bool options boolean, not integer (tried but had issues) - parser.add_argument( - "--return-on-fail", - nargs="?", - default=1, - type=int, - help=dedent( - """ - If 0, then the script will continue running even if a test fails, - enumerating all failures. Otherwise the script will exit on the first failure. - """ - ), - ) - parser.add_argument( "-v", "--verbose", @@ -1306,6 +1340,20 @@ def all_tests( ), ) + parser.add_argument( + "--aie-compilation-flags", + type=str, + help=dedent( + """ + Additional flags to pass to the AIE compiler, for all tests. 
+ Example, to print the IR between passes during compilation you might have: + --aie_compilation_flags="--mlir-print-ir-before-all --mlir-print-ir-module-scope + --aie2xclbin-print-ir-before-all --aie2xclbin-print-ir-module-scope"' + """ + ), + default="", + ) + tests = Tests() labels = tests.get_label_set() labels.append("All") @@ -1326,20 +1374,6 @@ def all_tests( default="All", ) - parser.add_argument( - "--additional-aie-compilation-flags", - type=str, - help=dedent( - """ - Additional flags to pass to the AIE compiler, for all tests. - Example, do print the IR between passes during compilation you might have: - --additional-aie-compilation-flags="--mlir-print-ir-before-all --mlir-print-ir-module-scope - --aie2xclbin-print-ir-before-all --aie2xclbin-print-ir-module-scope" - """ - ), - default="", - ) - parser.add_argument( "--device-hal", default="xrt-lite", @@ -1365,6 +1399,7 @@ def all_tests( raise ValueError( f"Invalid target device '{args.target_device}'. Available options: {current_devices}" ) + tests.add_aie_compilation_flags(args.aie_compilation_flags) all_tests( tests, @@ -1373,12 +1408,10 @@ def all_tests( args.peano_install_dir, args.xrt_dir, args.vitis_dir, - args.return_on_fail, args.verbose, args.reset_npu_between_runs, args.do_not_run_aie, test_set_list, - args.additional_aie_compilation_flags, args.device_hal, args.xrt_lite_n_core_rows, args.xrt_lite_n_core_cols, diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td index 729295535..9b8691810 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEOps.td +++ b/compiler/plugins/target/AMD-AIE/aie/AIEOps.td @@ -24,7 +24,7 @@ class AIE_Op traits = []> : def AIE_DeviceOp: AIE_Op<"device", [ HasParent<"mlir::ModuleOp">, - SymbolTable, SingleBlock, NoTerminator, IsolatedFromAbove + SymbolTable, SingleBlockImplicitTerminator<"EndOp">, IsolatedFromAbove ]> { let summary = "Define an AIE design targetting a complete device"; let arguments = (ins AMDAIEDeviceAttr:$device); diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEX.td b/compiler/plugins/target/AMD-AIE/aie/AIEX.td index b52b1dc57..783f5c001 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEX.td +++ b/compiler/plugins/target/AMD-AIE/aie/AIEX.td @@ -57,7 +57,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ OptionalAttr:$packet, FlatSymbolRefAttr:$metadata, I64Attr:$id, - DefaultValuedOptionalAttr:$issue_token + DefaultValuedOptionalAttr:$issue_token, + DefaultValuedOptionalAttr:$d0_zero_before, + DefaultValuedOptionalAttr:$d1_zero_before, + DefaultValuedOptionalAttr:$d2_zero_before, + DefaultValuedOptionalAttr:$d0_zero_after, + DefaultValuedOptionalAttr:$d1_zero_after, + DefaultValuedOptionalAttr:$d2_zero_after ); let assemblyFormat = [{ @@ -186,6 +192,7 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd"> { I32Attr:$d0_stride, I32Attr:$d1_size, I32Attr:$d1_stride, + I32Attr:$d2_size, I32Attr:$d2_stride, I32Attr:$iteration_current, I32Attr:$iteration_size, @@ -198,7 +205,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd"> { I32Attr:$lock_rel_id, I32Attr:$lock_acq_enable, I32Attr:$lock_acq_val, - I32Attr:$lock_acq_id + I32Attr:$lock_acq_id, + I32Attr:$d0_zero_before, + I32Attr:$d1_zero_before, + I32Attr:$d2_zero_before, + I32Attr:$d0_zero_after, + I32Attr:$d1_zero_after, + I32Attr:$d2_zero_after ); let results = (outs ); let assemblyFormat = [{ attr-dict }]; diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp b/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp index b604b7e30..0114adcee 100644 --- 
a/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIEXDialect.cpp @@ -30,6 +30,10 @@ void AIEXDialect::initialize() { #define GET_OP_CLASSES #include "aie/AIEX.cpp.inc" +//===----------------------------------------------------------------------===// +// NpuDmaMemcpyNdOp +//===----------------------------------------------------------------------===// + llvm::SmallVector AIEX::NpuDmaMemcpyNdOp::getStridesInAddressGranularity() { MemRefType buffer = getMemref().getType(); diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp index 2e790a7bf..171898f83 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIECreatePathFindFlows.cpp @@ -194,7 +194,8 @@ LogicalResult runOnPacketFlow( Port destPort = {(pktDest.getBundle()), pktDest.getChannel()}; TileLoc destCoord = {destTile.getCol(), destTile.getRow()}; if (pktFlowOp->hasAttr("keep_pkt_header")) - keepPktHeaderAttr[PhysPort{destCoord, destPort}] = + keepPktHeaderAttr[PhysPort{destCoord, destPort, + PhysPort::Direction::DST}] = StringAttr::get(Op.getContext(), "true"); assert(srcPort.bundle != StrmSwPortType::SS_PORT_TYPE_MAX && srcPort.channel != -1 && "expected srcPort to have been set"); @@ -243,8 +244,10 @@ LogicalResult runOnPacketFlow( SmallVector slavePorts; for (const auto &[tileId, connects] : switchboxes) { for (const auto &[conn, flowID] : connects) { - PhysPortAndID sourceFlow = {PhysPort{tileId, conn.src}, flowID}; - packetFlows[sourceFlow].insert({PhysPort{tileId, conn.dst}, flowID}); + PhysPortAndID sourceFlow = { + PhysPort{tileId, conn.src, PhysPort::Direction::SRC}, flowID}; + packetFlows[sourceFlow].insert( + {PhysPort{tileId, conn.dst, PhysPort::Direction::DST}, flowID}); slavePorts.push_back(sourceFlow); } } @@ -307,7 +310,7 @@ LogicalResult runOnPacketFlow( std::sort(tileMasters.begin(), tileMasters.end()); for (Port tileMaster : tileMasters) { std::vector> amsels = - masterSets[{tileLoc, tileMaster}]; + masterSets[{tileLoc, tileMaster, PhysPort::Direction::DST}]; std::vector amselVals; for (std::pair amsel : amsels) { assert(amselOps.count(amsel) == 1 && "expected amsel in amselOps"); @@ -316,7 +319,8 @@ LogicalResult runOnPacketFlow( auto msOp = builder.create( builder.getUnknownLoc(), builder.getIndexType(), (tileMaster.bundle), tileMaster.channel, amselVals); - if (auto pktFlowAttrs = keepPktHeaderAttr[{tileLoc, tileMaster}]) + if (auto pktFlowAttrs = keepPktHeaderAttr[{tileLoc, tileMaster, + PhysPort::Direction::DST}]) msOp->setAttr("keep_pkt_header", pktFlowAttrs); } diff --git a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp index 7d8df1b1d..06bdea543 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AMDAIEDmaToNpu.cpp @@ -301,6 +301,7 @@ struct DmaToNpuPattern : OpConversionPattern { auto d0_stride = zero; auto d1_size = zero; auto d1_stride = zero; + auto d2_size = zero; auto d2_stride = zero; auto iteration_current = zero; auto iteration_size = zero; @@ -314,6 +315,12 @@ struct DmaToNpuPattern : OpConversionPattern { auto lock_acq_enable = zero; auto lock_acq_val = zero; auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; auto 
issue_token = BoolAttr::get(ctx, false); auto repeat_count = zero; @@ -361,6 +368,9 @@ struct DmaToNpuPattern : OpConversionPattern { // d1_stride if (strides[1]) d1_stride = IntegerAttr::get(i32ty, strides[1] - 1); + // d2_size + if (strides[3]) d2_size = IntegerAttr::get(i32ty, sizes[2]); + // d2_stride if (strides[2]) d2_stride = IntegerAttr::get(i32ty, strides[2] - 1); @@ -389,12 +399,32 @@ struct DmaToNpuPattern : OpConversionPattern { // This logic is kept for now for backward compatibility. if (!isMM2S) issue_token = BoolAttr::get(ctx, true); + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + rewriter.create( op->getLoc(), column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, - d0_stride, d1_size, d1_stride, d2_stride, iteration_current, + d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, - lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after); AMDAIEDeviceModel tm = getDeviceModel(static_cast(dev.getDevice())); diff --git a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt index 3f14932d9..8dfe5a24c 100644 --- a/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/air/CMakeLists.txt @@ -203,6 +203,27 @@ replace_string_in_file( ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp "memtileToSizeMap[t] = m.getTargetModel().getMemTileSize()" "memtileToSizeMap[t] = m.getTargetModel().getMemTileSize(t.getCol(), t.getRow())") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "targetModel.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "device.getTargetModel().hasProperty(AIE::AIETargetModel::IsNPU)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIEPass.cpp + "AIE::getTargetModel(*device)" + "getDeviceModel(*device)") + +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIESchedulingUtils.cpp + "targetModel.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") +replace_string_in_file( + ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRToAIESchedulingUtils.cpp + "target_model.hasProperty(AIE::AIETargetModel::UsesSemaphoreLocks)" + "true") replace_string_in_file( ${IREE_MLIR_AIR_SOURCE_DIR}/lib/Conversion/AIRRtToNpuPass.cpp diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir index 8e58250c0..44c6eba7c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/npu_instgen.mlir @@ -43,6 +43,7 @@ module 
{ d1_stride = 7 : i32, d1_size = 8 : i32, d2_stride = 9 : i32, + d2_size = 14 : i32, ddr_id = 10 : i32, iteration_current = 11 : i32, iteration_stride = 12 : i32, @@ -54,7 +55,14 @@ module { lock_rel_val = 4 : i32, next_bd = 5 : i32, use_next_bd = 1 : i32, - valid_bd = 1 : i32} + valid_bd = 1 : i32, + d0_zero_before = 0 : i32, + d1_zero_before = 1 : i32, + d2_zero_before = 2 : i32, + d0_zero_after = 3 : i32, + d1_zero_after = 4 : i32, + d2_zero_after = 5 : i32 + } // CHECK: 00000000 // CHECK: 00000000 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir index 6130aa54b..e0eacc703 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_elementwise_pack_peel_objectfifo_e2e.mlir @@ -13,7 +13,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index 429089cdd..7b0b58026 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. 
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s @@ -20,7 +20,7 @@ // CHECK: aie.use_lock // Check a bit of the aiex.runtime_sequence: // CHECK: aiex.runtime_sequence @matmul_i32() -// CHECK: } {npu_instructions = dense_resource : tensor<174xui32>, runtime_sequence_name = "matmul_i32"} +// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "matmul_i32"} #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir index 9229da0c3..b69322068 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_e2e.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: aie.core(%[[TILE_0_3]]) // CHECK-DAG: aie.core(%[[TILE_1_3]]) // CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// CHECK-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // CHECK-DAG: aie.memtile_dma(%[[TILE_0_1]]) // CHECK-DAG: aie.mem(%[[TILE_0_2]]) // CHECK-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir index 326a178e5..210b1ce99 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo_ukernel_e2e.mlir @@ -15,7 +15,7 @@ // PHOENIX-DAG: aie.core(%[[TILE_0_3]]) // PHOENIX-DAG: aie.core(%[[TILE_1_3]]) // PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// PHOENIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // PHOENIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_2]]) // PHOENIX-DAG: aie.mem(%[[TILE_0_3]]) @@ -39,7 +39,7 @@ // STRIX-DAG: aie.core(%[[TILE_0_3]]) // STRIX-DAG: aie.core(%[[TILE_1_3]]) // STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 0) -// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 1, 0) +// STRIX-DAG: aie.shim_dma_allocation {{.*}}(MM2S, 0, 1) // STRIX-DAG: aie.memtile_dma(%[[TILE_0_1]]) // STRIX-DAG: aie.mem(%[[TILE_0_2]]) // STRIX-DAG: aie.mem(%[[TILE_0_3]]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp index 05f004545..1189306b6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp @@ -4,10 +4,10 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Iterators.h" #define DEBUG_TYPE "iree-amdaie-access-to-acquire-release" @@ -15,86 +15,111 @@ namespace mlir::iree_compiler::AMDAIE { namespace { +/// Some blocks have terminator ops, which must appear as the very last op in +/// the block. If `block` has a terminator, set the insertion point of +/// `rewriter` to just before the terminator, ready to create a new penultimate +/// op in the block. Otherwise, set the insertion point to the very end of the +/// block. +void setInsertionToEnd(IRRewriter &rewriter, Block *block) { + if (block->back().hasTrait()) { + rewriter.setInsertionPoint(block->getTerminator()); + } else { + rewriter.setInsertionPointToEnd(block); + } +} + +llvm::MapVector> +getFifosToAccesses(AMDAIE::CoreOp coreOp, AMDAIE::MemoryAccess type) { + llvm::MapVector> + accesses; + coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getAccessType() != type) return WalkResult::advance(); + Value input = accessOp.getInput(); + auto iter = accesses.find(input); + if (iter == accesses.end()) { + accesses.insert({input, {accessOp}}); + } else { + iter->second.push_back(accessOp); + } + return WalkResult::advance(); + }); + return accesses; +} + /// Walk all read access operations within the core operations and insert /// semaphore acquire and release stubs. Acquire operations will be inserted -/// at the location of the access operation and release operations will be -/// inserted before the next access or at the end of the block. +/// at the location of the access operation, and release operations will be +/// inserted some time before the next read access. 
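+/// For example, two reads of the same logical objectfifo,
+///
+///   %0 = access(%fifo, Read)
+///   ...
+///   %1 = access(%fifo, Read)
+///
+/// become, roughly (illustrative pseudo-IR, not exact dialect syntax):
+///
+///   %a0 = acquire(%connection, Consume)
+///   %0  = access(%a0, Read)
+///   ...
+///   release(%connection, Consume)
+///   %a1 = acquire(%connection, Consume)
+///   %1  = access(%a1, Read)
+///   ...
+///   release(%connection, Consume)   // for the last read: at the end of the block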
LogicalResult readAccessToAcquireRelease(Operation *parentOp) { + AMDAIE::MemoryAccess accessType = AMDAIE::MemoryAccess::Read; + AMDAIE::LogicalObjectFifoPort port = LogicalObjectFifoPort::Consume; + IRRewriter rewriter(parentOp->getContext()); SmallVector coreOps; parentOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); - // Map from DMA source/target logical objectFifos to those respective DMA - // operations. - DenseMap logicalObjectFifoToDma; + // Map from the source and target amdaie.logicalobjectfifo values of + // amdaie.connections to the amdaie.connections themselves. + DenseMap logicalObjectFifoToConnection; parentOp->walk([&](AMDAIE::ConnectionOp dmaOp) { - logicalObjectFifoToDma[dmaOp.getSource()] = dmaOp; - logicalObjectFifoToDma[dmaOp.getTarget()] = dmaOp; + logicalObjectFifoToConnection.insert({dmaOp.getSource(), dmaOp}); + logicalObjectFifoToConnection.insert({dmaOp.getTarget(), dmaOp}); }); for (AMDAIE::CoreOp coreOp : coreOps) { - llvm::MapVector - logicalObjectFifoToLastAccess; - WalkResult res = - coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { - if (accessOp.getAccessType() != AMDAIE::MemoryAccess::Read) - return WalkResult::advance(); - - if (logicalObjectFifoToLastAccess.contains(accessOp.getInput())) { - rewriter.setInsertionPoint(accessOp); - rewriter.create( - rewriter.getUnknownLoc(), - logicalObjectFifoToDma[accessOp.getInput()].getResult(), - LogicalObjectFifoPort::Consume); - } + auto fifosToAccesses = getFifosToAccesses(coreOp, accessType); - if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { - accessOp.emitOpError() - << "read access not found as source of DMA operation"; - return WalkResult::interrupt(); - } - rewriter.setInsertionPoint(accessOp); - auto acquireOp = rewriter.create( - rewriter.getUnknownLoc(), - llvm::cast(accessOp.getInput().getType()), - logicalObjectFifoToDma[accessOp.getInput()].getResult(), - LogicalObjectFifoPort::Consume); - auto newAccessOp = rewriter.create( - rewriter.getUnknownLoc(), acquireOp.getResult(), - AMDAIE::MemoryAccess::Read); - rewriter.replaceAllUsesWith(accessOp.getResult(), - newAccessOp.getResult()); - logicalObjectFifoToLastAccess[accessOp.getInput()] = accessOp; - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); + for (auto &&[logicalObjectFifo, accessOps] : fifosToAccesses) { + for (uint64_t i = 0; i < accessOps.size(); ++i) { + AMDAIE::LogicalObjectFifoAccessOp accessOp = accessOps[i]; - // Insert release for remaining read access operations at end of block. - for (auto &&[value, accessOp] : logicalObjectFifoToLastAccess) { - Block *parentBlock = accessOp->getBlock(); - if (!parentBlock->back().hasTrait()) { - rewriter.setInsertionPointToEnd(parentBlock); - } else { - rewriter.setInsertionPoint(parentBlock->getTerminator()); - } - if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { - accessOp.emitOpError() - << "read access not found as source of DMA operation"; - return failure(); + Value input = accessOp.getInput(); + if (!logicalObjectFifoToConnection.contains(input)) { + return accessOp.emitOpError() + << "does not have a connection in the logicalobjectfifo map"; + } + + // Insert the access op. 
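+          // An acquire op is created at the location of the original access,
+          // and the access is recreated on the acquired value.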
+ rewriter.setInsertionPoint(accessOp); + Block *block = accessOp->getBlock(); + auto acquireOp = rewriter.create( + rewriter.getUnknownLoc(), + llvm::cast(input.getType()), + logicalObjectFifoToConnection[input].getResult(), port); + auto newAccessOp = rewriter.create( + rewriter.getUnknownLoc(), acquireOp.getResult(), accessType); + rewriter.replaceAllUsesWith(accessOp.getResult(), + newAccessOp.getResult()); + + // Insert the release op. The location of the release is as close to the + // following access op as possible, but always in the same block as the + // access op being released. + AMDAIE::LogicalObjectFifoAccessOp nextAccessOp; + if (i + 1 != accessOps.size()) nextAccessOp = accessOps[i + 1]; + Operation *nextAccessOpsAncestor = + getAncestorInBlock(nextAccessOp, block); + if (nextAccessOpsAncestor && + nextAccessOpsAncestor->getBlock() == block) { + rewriter.setInsertionPoint(nextAccessOpsAncestor); + } else { + setInsertionToEnd(rewriter, block); + } + rewriter.create( + rewriter.getUnknownLoc(), + logicalObjectFifoToConnection[input].getResult(), port); } - rewriter.create( - rewriter.getUnknownLoc(), logicalObjectFifoToDma[accessOp.getInput()], - LogicalObjectFifoPort::Consume); } } return success(); } /// Walk all write access operations within the core operations and insert -/// semaphore operations. Release operations will be inserted -/// at the location of the access operation and acquire operations will be -/// inserted after the preceding access or at the beginning of the block. +/// semaphore operations. Release operations will be inserted at the location of +/// the access operation and acquire operations will be inserted after the +/// preceding access or at the beginning of the block. TODO(newling): update +/// this to ensure that corresponding accesses and releases are in the same +/// block, as in the case of `readAccessToAcquireRelease`. 
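+/// Sketched on pseudo-IR (illustrative only, assuming the Produce port is the
+/// write-side analogue of Consume), a single write
+///
+///   access(%fifo, Write)
+///
+/// becomes
+///
+///   %a = acquire(%connection, Produce)   // after the preceding access / at block start
+///   ...
+///   access(%a, Write)
+///   release(%connection, Produce)        // at the location of the write access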
LogicalResult writeAccessToAcquireRelease(Operation *parentOp) { IRRewriter rewriter(parentOp->getContext()); @@ -214,7 +239,7 @@ class AMDAIEAccessToAcquireReleasePass AMDAIEAccessToAcquireReleasePass() = default; AMDAIEAccessToAcquireReleasePass( - const AMDAIEAccessToAcquireReleasePass &pass) {}; + const AMDAIEAccessToAcquireReleasePass &pass){}; void runOnOperation() override; }; @@ -225,6 +250,7 @@ void AMDAIEAccessToAcquireReleasePass::runOnOperation() { "acquire-release semaphore stubs"; return signalPassFailure(); } + if (failed(writeAccessToAcquireRelease(parentOp))) { parentOp->emitOpError() << "failed to convert write access operations to " "acquire-release semaphore stubs"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp index cb5e66424..cbbc2cf8b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAcquireReleaseToUseLock.cpp @@ -9,9 +9,7 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/AMDAIEOpUtils.h" #include "iree-amd-aie/Transforms/Passes.h" -#include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/Support/MathExtras.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/SCF/Utils/Utils.h" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp new file mode 100644 index 000000000..7b4acc0b1 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp @@ -0,0 +1,429 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelGenerator.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-assign-tiles" + +namespace mlir::iree_compiler::AMDAIE { + +/// Return the tiles of the sources respectively targets of the users of this +/// logical objectfifo, depending on whether the OperateOn template parameter is +/// set to `OperateOn::Source` respectively `OperateOn::Target`. +template +LogicalResult getUserTiles( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + SmallVectorImpl &tiles) { + llvm::SmallSetVector tileSet; + for (Operation *user : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(user)) { + ValueRange tileIndices; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getSourceObjectFifo().getTiles(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getTargetObjectFifo().getTiles(); + } + // Only fill in tiles when all sources have tiles. 
+ if (tileIndices.empty()) return failure(); + for (Value index : tileIndices) { + tileSet.insert( + dyn_cast_if_present(index.getDefiningOp())); + } + } + } + tiles = tileSet.takeVector(); + return success(); +} + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. +LogicalResult findUsersInCoreAndAddTiles( + Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + llvm::SmallSetVector, 16> &tiles) { + for (Operation *userOp : op->getUsers()) { + if (auto coreOp = userOp->getParentOfType()) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) + return coreOp.emitOpError() << "has non-constant tile location"; + tiles.insert(std::make_pair(column.value(), row.value())); + } + if (auto subviewOp = dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); + } else if (auto userLogicalObjectFifo = + dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(userLogicalObjectFifo, + logicalObjectFifo, tiles); + } + } + return success(); +} + +/// Utility to clear non-local tile assignments. +LogicalResult clearNonLocalTiles(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp objFifo) { + if (objFifo.getMemorySpaceAsUInt() != 2) { + rewriter.setInsertionPoint(objFifo); + SmallVector tiles; + rewriter.replaceOpWithNewOp( + objFifo, cast(objFifo.getOutput().getType()), + objFifo.getMemref(), tiles); + } + }); + return success(); +} + +/// Utility to duplicate global objectFifos (L3) for each strided copy-like +/// operation user to allow global logical objectFifos to be assigned to +/// different tile locations. +LogicalResult duplicateGlobalObjFifos(RewriterBase &rewriter, Operation *op) { + op->walk([&](AMDAIE::DoublyStridedCopyOpInterface copyOp) { + auto source = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + auto target = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + auto createNewObjFifoAndReplaceUsesFrom = + [&](AMDAIE::LogicalObjectFifoFromMemrefOp oldObjFifo) { + rewriter.setInsertionPoint(copyOp); + auto newObjFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast(oldObjFifo.getOutput().getType()), + oldObjFifo.getMemref()); + rewriter.replaceUsesWithIf( + oldObjFifo.getOutput(), newObjFifo.getOutput(), + [&](OpOperand &use) { + return use.getOwner() == copyOp.getOperation(); + }); + }; + if (source && source.getMemorySpaceAsUInt() == 0) { + createNewObjFifoAndReplaceUsesFrom(source); + } + if (target && target.getMemorySpaceAsUInt() == 0) { + createNewObjFifoAndReplaceUsesFrom(target); + } + }); + return success(); +} + +/// Assign tiles to the logical objectfifos with local memory space (L1). +/// The tiles are derived from the usage of the logical objectfifos within +/// core operations, which are already assigned a tile location. 
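+/// For example, a logical objectfifo that is only accessed from within an
+/// `amdaie.core` op placed on tile (col = 0, row = 2) is assigned tile (0, 2).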
+LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op) { + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles( + logicalObjectFifo, logicalObjectFifo, tileLocations))) { + return WalkResult::interrupt(); + } + // Handle subviews. + for (Operation *userOp : + logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + if (auto subviewOp = dyn_cast(userOp)) { + if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, + tileLocations))) { + return WalkResult::interrupt(); + } + } + } + + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(logicalObjectFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tiles); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +/// Assign a set of candidate physical AIE tiles to logical objectFifos. This +/// rewrite takes an iterative approach by matching logical objectfifos and only +/// assigning tiles when linked through dma ops with other logical objectfifos +/// which already have tiles assigned. If the linked logical objectfifos don't +/// have tiles assigned yet, we will return a failure and give the linked +/// logical objectfifos a chance to assign tiles before returning to this one. 
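+/// For example, if the consumers of an L2 objectFifo are L1 objectFifos that
+/// already have tiles (0, 2) and (1, 2) assigned, the candidate locations for
+/// the L2 objectFifo become columns 0 and 1 on the row reported for its
+/// memory space (the mem tile row).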
+class FillTiles + : public OpRewritePattern { + using OpRewritePattern< + AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; + + public: + FillTiles(MLIRContext *context, const AMDAIE::AMDAIEDeviceModel &deviceModel) + : OpRewritePattern(context), deviceModel(deviceModel) {} + + LogicalResult matchAndRewrite( + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "FillTiles: " << logicalObjectFifo << "\n"); + if (!logicalObjectFifo.getTiles().empty()) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "Tiles are already assigned."); + } + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Skip logical objFifos that don't operate on L3 or L2"); + } + + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(logicalObjectFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(logicalObjectFifo, sourceTiles); + if (failed(dstRes) && failed(srcRes)) { + return rewriter.notifyMatchFailure(logicalObjectFifo, + "No source or target tiles found"); + } + + SmallVector memSpaceRows = deviceModel.getMemSpaceRows(memSpace); + if (memSpaceRows.size() == 0) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No rows found for the memory space of this logical objFifo"); + } + if (memSpaceRows.size() > 1) { + logicalObjectFifo.emitWarning() + << "has a memory space with multiple available rows, the first one " + "of which is chosen for tile assignment, but this might not lead " + "to good usage of the available resources."; + } + uint32_t row = memSpaceRows[0]; + llvm::SmallSetVector, 16> tileLocations; + auto createTileLocations = + [&](SmallVector &tiles) -> LogicalResult { + // For deterministic and canonical output, sort on column index and erase + // duplicates. + std::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + for (AMDAIE::TileOp tile : tiles) { + std::optional column = getConstantIntValue(tile.getCol()); + if (!column) + return rewriter.notifyMatchFailure(tile, "found non-constant column"); + tileLocations.insert(std::make_pair(column.value(), row)); + } + return success(); + }; + + if (!targetTiles.empty() && !sourceTiles.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Found logical objectfifo with both source and target tiles, which " + "is not supported yet"); + } else if (!targetTiles.empty()) { + // Create tile locations for this logical objectfifo based on the + // consumers' tiles. + if (failed(createTileLocations(targetTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the consumers' tiles."); + } + } else if (!sourceTiles.empty()) { + // Create tile locations for this logical objectfifo based on producers' + // tiles. + if (failed(createTileLocations(sourceTiles))) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Could not find tile locations based on the producers' tiles."); + } + } else { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "Don't assign this logicalObjectFifo to a physical tile (yet!). Wait " + "for other logical objectfifos to be assigned first."); + } + + if (tileLocations.empty()) { + return rewriter.notifyMatchFailure( + logicalObjectFifo, + "No tile locations found for this logical objFifo. 
Maybe in a next " + "iteration, with more information, a tile location can be found."); + } + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, logicalObjectFifo.getMemref(), + tileLocations.takeVector()); + return success(); + } + + private: + // The device model used to retrieve device specific information. + const AMDAIEDeviceModel &deviceModel; +}; + +/// Assign tile locations to objectFifos. Start by searching for a set of +/// candidate tile locations and then assign tiles based on a simple usage-based +/// model that prioritizes tiles that have the least usage. +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel) { + MLIRContext *context = rewriter.getContext(); + if (failed(clearNonLocalTiles(rewriter, op))) + return op->emitOpError() << "failed to clear non-local tile assignemts"; + + // Find and fill the tile candidates. + RewritePatternSet fillTilePatterns(context); + fillTilePatterns.insert(context, deviceModel); + if (failed(applyPatternsAndFoldGreedily(op, std::move(fillTilePatterns)))) { + return op->emitOpError() + << "collection of tile candidates for logical objectFifos failed"; + } + if (failed(verify(op, true))) { + return failure(); + } + LLVM_DEBUG(llvm::dbgs() << "After fillTiles: \n" << *op << "\n"); + + // Keep track of the buffer usage on tiles to try distributing buffers evenly + // over available tile resources. + DenseMap tileLocToUsage; + auto tileLocAndUsageCmp = [&](AMDAIE::TileOp a, AMDAIE::TileOp b) -> bool { + int64_t colA = getConstantIndexOrAssert(a.getCol()); + int64_t rowA = getConstantIndexOrAssert(a.getRow()); + int64_t colB = getConstantIndexOrAssert(b.getCol()); + int64_t rowB = getConstantIndexOrAssert(b.getRow()); + size_t usageA = tileLocToUsage[TileLoc(colA, rowA)]; + size_t usageB = tileLocToUsage[TileLoc(colB, rowB)]; + if (usageA < usageB) return true; + if (usageA > usageB) return false; + if (colA < colB) return true; + if (colA > colB) return false; + if (rowA < rowB) return true; + if (rowA > rowB) return false; + assert(false && "same tiles should never be compared"); + }; + + // After filling tile candidates, find and assign a specific one. + DenseMap logicalObjFifoToTileId; + WalkResult res = + op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + if (memSpace != 0 && memSpace != 1) return WalkResult::advance(); + if (logicalObjectFifo.getTiles().size() == 0) { + logicalObjectFifo.emitOpError() + << "should have at least one tile candidate"; + return WalkResult::interrupt(); + } + + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast_if_present(tile.getDefiningOp()); + }); + AMDAIE::TileOp assignedTileOp = + *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); + + // Increase usage of the chosen tile as a new logical objectFifo will be + // assigned to it. This allows distributing the logical objectFifos + // evenly across the available tile resources. 
+        int64_t col = getConstantIndexOrAssert(assignedTileOp.getCol());
+        int64_t row = getConstantIndexOrAssert(assignedTileOp.getRow());
+        tileLocToUsage[TileLoc(col, row)] += 1;
+
+        rewriter.setInsertionPoint(logicalObjectFifo);
+        SmallVector tileResults = {
+            cast(assignedTileOp.getResult())};
+        rewriter.replaceOpWithNewOp(
+            logicalObjectFifo,
+            cast(
+                logicalObjectFifo.getOutput().getType()),
+            logicalObjectFifo.getMemref(), tileResults);
+        return WalkResult::advance();
+      });
+  if (res.wasInterrupted()) return failure();
+  return success();
+}
+
+namespace {
+
+class AMDAIEAssignTilesPass
+    : public impl::AMDAIEAssignTilesBase {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override;
+};
+
+void AMDAIEAssignTilesPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  IRRewriter rewriter(&getContext());
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
+  std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr);
+  if (!maybeDevice) {
+    parentOp->emitOpError()
+        << "has no AMDAIEDevice in the target attribute configuration. This "
+           "device-specific information is required for looking up column and "
+           "row related information, and must be attached to a containing "
+           "ModuleOp.";
+    return signalPassFailure();
+  }
+  AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value());
+
+  // Assign tile locations to logical objectFifos on local (L1) memory.
+  if (failed(assignLocalTiles(rewriter, parentOp))) {
+    parentOp->emitOpError() << "local tile assignment failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After assignLocalTiles: \n" << *parentOp << "\n");
+
+  // Duplicate global objectFifos for each strided copy-like operation user to
+  // allow global logical objectFifos to be assigned to different tile
+  // locations.
+  if (failed(duplicateGlobalObjFifos(rewriter, parentOp))) {
+    parentOp->emitOpError() << "failed duplicating global object fifos";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After duplicateGlobalObjFifos: \n"
+                          << *parentOp << "\n");
+
+  // Assign tile locations to logical objectFifos on non-local (not L1) memory.
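+  // Candidate tiles are balanced by usage: e.g. if two L2 objectFifos both
+  // have candidates {(0, 1), (1, 1)}, the first one encountered is assigned
+  // (0, 1) and the second one (1, 1), since (0, 1) is already in use by then.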
+  if (failed(assignNonLocalTiles(rewriter, parentOp, deviceModel))) {
+    parentOp->emitOpError() << "non-local tile assignment failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "After assignNonLocalTiles: \n"
+                          << *parentOp << "\n");
+}
+
+} // namespace
+
+std::unique_ptr createAMDAIEAssignTilesPass() {
+  return std::make_unique();
+}
+
+} // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
index 2f2ee3297..e42b6d597 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEBufferizeToAllocation.cpp
@@ -8,7 +8,6 @@
 #include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/IR/Iterators.h"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
new file mode 100644
index 000000000..008bf4124
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeForallToFor.cpp
@@ -0,0 +1,75 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the conversion of `scf.forall` ops within
+// `amdaie.controlcode` ops into `scf.for` operations. This can help discover
+// new control code optimization opportunities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+
+#define DEBUG_TYPE "iree-amdaie-controlcode-forall-to-for"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Converts `scf.forall` operations found within the provided op into nested
+/// `scf.for` operations.
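+/// Single-iteration `scf.forall` ops are promoted (inlined) instead. For
+/// example, `scf.forall (%i, %j) in (2, 3)` is rewritten into two nested
+/// `scf.for` loops with upper bounds 2 and 3.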
+LogicalResult forallToFor(RewriterBase &rewriter, Operation *op) { + WalkResult res = op->walk([&](scf::ForallOp forallOp) { + rewriter.setInsertionPoint(forallOp); + if (succeeded(forallOp.promoteIfSingleIteration(rewriter))) { + return WalkResult::advance(); + } + if (failed(scf::forallToForLoop(rewriter, forallOp))) { + forallOp.emitOpError() << "was not transformed from `scf.forall` to `scf.for`"; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +class AMDAIEControlCodeForallToForPass + : public impl::AMDAIEControlCodeForallToForBase< + AMDAIEControlCodeForallToForPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeForallToForPass::runOnOperation() { + Operation *parentOp = getOperation(); + IRRewriter rewriter(parentOp->getContext()); + parentOp->walk([&](AMDAIE::ControlCodeOp controlCodeOp) { + if (failed(forallToFor(rewriter, controlCodeOp.getOperation()))) { + return signalPassFailure(); + } + // Make sure to hoist `affine.apply` ops out of the innermost `scf.for` ops + // if applicable. + controlCodeOp->walk([&](affine::AffineApplyOp applyOp) { + (void)hoistForAffineApplyOp(rewriter, applyOp); + }); + }); +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeForallToForPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 4ed6d0bb0..d92f23af9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -93,10 +93,10 @@ class TransactionBuilder { uint32_t addr = direction == AMDAIE::DMAChannelDir::MM2S ? 0x1D214 : 0x1D204; if (channel == 1) addr += 0x8; - if (col && row) { - addr |= ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | - (addr & 0xFFFFF); - } + // TODO(jornt): use aie-rt's transaction serializer instead to avoid these + // indiscrepancies between this file and aie-rt. + addr = ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | + (addr & 0xFFFFF); uint32_t value = 0; value |= bdId & 0xF; value |= (repeatCount & 0xFF) << 16; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index dbd439458..26a269935 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/Support/Debug.h" @@ -323,37 +324,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Return the tiles of the sources respectively targets of the users of this -/// logical objectfifo, depending on whether the OperateOn template parameter is -/// set to `OperateOn::Source` respectively `OperateOn::Target`. 
-template -LogicalResult getUserTiles( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - SmallVectorImpl &tiles) { - llvm::SmallSetVector tileSet; - for (Operation *user : logicalObjectFifo->getUsers()) { - if (auto dmaOp = dyn_cast(user)) { - ValueRange tileIndices; - if constexpr (OperateOn == CopyOpOperateOn::Source) { - if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getSourceObjectFifo().getTiles(); - } else if constexpr (OperateOn == CopyOpOperateOn::Target) { - if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; - tileIndices = dmaOp.getTargetObjectFifo().getTiles(); - } - - // Only fill in tiles when all sources have tiles. - if (tileIndices.empty()) return failure(); - for (Value index : tileIndices) { - tileSet.insert( - dyn_cast_if_present(index.getDefiningOp())); - } - } - } - tiles = tileSet.takeVector(); - return success(); -} - /// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the /// memrefs from logical objectfifos and update the computational operations to /// operate on these local memrefs. @@ -454,229 +424,6 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { return success(); } -/// Utility to recursively find users of the provided logical objectFifo inside -/// `amdaie.core` operations and return the tile coordinates. -LogicalResult findUsersInCoreAndAddTiles( - Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - llvm::SmallSetVector, 16> &tiles) { - for (Operation *userOp : op->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - AMDAIE::TileOp tileOp = coreOp.getTileOp(); - std::optional column = getConstantIntValue(tileOp.getCol()); - std::optional row = getConstantIntValue(tileOp.getRow()); - if (!column || !row) { - return coreOp.emitOpError() << "has non-constant tile location"; - } - tiles.insert(std::make_pair(column.value(), row.value())); - } - if (auto subviewOp = dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); - } else if (auto userLogicalObjectFifo = - dyn_cast(userOp)) { - return findUsersInCoreAndAddTiles(userLogicalObjectFifo, - logicalObjectFifo, tiles); - } - } - return success(); -} - -/// Assign tiles to the logical objectfifos with local memory space (L1). -/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - WalkResult res = moduleOp->walk( - [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (!memSpace || dyn_cast(memSpace).getInt() != 2) - return WalkResult::advance(); - - llvm::SmallSetVector, 16> tileLocations; - if (failed(findUsersInCoreAndAddTiles( - logicalObjectFifo, logicalObjectFifo, tileLocations))) { - return WalkResult::interrupt(); - } - // Handle subviews. 
- for (Operation *userOp : - logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { - if (auto subviewOp = dyn_cast(userOp)) { - if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, - tileLocations))) { - return WalkResult::interrupt(); - } - } - } - - SmallVector tiles; - tiles.reserve(tileLocations.size()); - rewriter.setInsertionPoint(logicalObjectFifo); - for (auto [column, row] : tileLocations) { - auto colIndex = rewriter.create( - rewriter.getUnknownLoc(), column); - auto rowIndex = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create( - rewriter.getUnknownLoc(), colIndex, rowIndex); - tiles.push_back(tileOp.getResult()); - } - // Sort for deterministic output IR. - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileValueColumnAndRowComparator); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast( - logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -/// Assign a set of potential physical AIE tiles to logical objectFifos. This -/// rewrite takes an iterative approach by matching logical objectfifos and only -/// assigning tiles when linked through dma ops with other logical objectfifos -/// which already have tiles assigned. If the linked logical objectfifos don't -/// have tiles assigned yet, we will return a failure and give the linked -/// logical objectfifos a chance to assign tiles before returning to this one. -/// -/// TODO(jornt): There are decisions being made in this pass on which tiles to -/// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the tiles in the same columns as targets and sources. At some point, -/// we probably need some AIE device model to guide the assignement here for -/// performance and to avoid hardware resource issues later on. -class FillAieTiles - : public OpRewritePattern { - using OpRewritePattern< - AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; - - LogicalResult matchAndRewrite( - AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - PatternRewriter &rewriter) const override { - LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); - if (!logicalObjectFifo.getTiles().empty()) { - return failure(); - } - - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - // Skip logical objectfifos within local memory as they should already be - // assigned. - if (memSpace && dyn_cast(memSpace).getInt() == 2) { - if (logicalObjectFifo.getTiles().empty()) { - logicalObjectFifo.emitOpError() - << "found logical objectfifo on local memory space with no tiles " - "assigned."; - } - return failure(); - } - // HandLe both L3/shim and L2/Memtiles. - // Skip logical objectfifos within non-global and non-shared memory. - if (memSpace && dyn_cast(memSpace).getInt() != 1) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; - } - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); - } - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int64_t rowInt = memSpace ? 
1 : 0; - llvm::SmallSetVector, 16> tileLocations; - auto createTileLocations = - [&](SmallVector &tiles) -> LogicalResult { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign - // tiles based on a resource model. - std::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - // Erase duplicates. - tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); - for (AMDAIE::TileOp tile : tiles) { - std::optional column = getConstantIntValue(tile.getCol()); - if (!column) return tile.emitOpError() << "found non-constant column"; - tileLocations.insert(std::make_pair(column.value(), rowInt)); - } - return success(); - }; - - if (!targetTiles.empty() && !sourceTiles.empty()) { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with both source and target tiles, " - "which is not supported yet"; - } else if (!targetTiles.empty()) { - // Create tile locations for this logical objectfifo based on target - // tiles. - if (failed(createTileLocations(targetTiles))) { - return failure(); - } - } else if (!sourceTiles.empty()) { - // Create tile locations for this logical objectfifo based on source - // tiles. - if (failed(createTileLocations(sourceTiles))) { - return failure(); - } - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. - return failure(); - } - - // If no tile results, skip, and maybe in a next iteration another tile will - // be found. - if (tileLocations.empty()) { - return failure(); - } - - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, logicalObjectFifo.getMemref(), - tileLocations.takeVector()); - return success(); - } -}; - -/// Assign specific tile locations to objectFifos, starting from the set of -/// potential tile locations filled in earlier. -LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (memSpace && dyn_cast(memSpace).getInt() != 1) - return WalkResult::advance(); - - SmallVector tiles = - llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { - return dyn_cast_if_present(tile.getDefiningOp()); - }); - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - - // For now, use first tile in sorted list. - // TODO(jornt): This will need to become more complex in the future to - // account for potential hardware limitations and constraints. 
- SmallVector tileResults = {cast(tiles[0].getResult())}; - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return WalkResult::advance(); - }); - return success(); -} - class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< AMDAIEDistributeCoresAndObjectFifosPass> { @@ -694,6 +441,17 @@ class AMDAIEDistributeCoresAndObjectFifosPass void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { MLIRContext *context = &getContext(); ModuleOp moduleOp = getOperation(); + IRRewriter rewriter(moduleOp.getContext()); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(moduleOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + moduleOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required for tile assignment " + "purposes, and must be attached to a containing ModuleOp."; + return signalPassFailure(); + } + AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); // Convert local scf.forall operations selected for parallel distribution to // nested scf.for operations. @@ -750,7 +508,7 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { << moduleOp << "\n"); // Assign tile locations to logical objectfifos on local (L1) memory. - if (failed(assignLocalAieTiles(moduleOp))) { + if (failed(assignLocalTiles(rewriter, moduleOp))) { moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } @@ -759,40 +517,21 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n" + LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalTiles: \n" << moduleOp << "\n"); - // Assign a set of potential tile locations to the remaining logical - // objectFifos. - RewritePatternSet assignAieTilePatters(context); - assignAieTilePatters.insert(context); - if (failed(applyPatternsAndFoldGreedily(moduleOp, - std::move(assignAieTilePatters)))) { - moduleOp.emitOpError() - << "collection of tile candidates for logical objectFifos failed"; + // Assign tile locations to logical objectfifos on non-local (not L1) memory. + if (failed(assignNonLocalTiles(rewriter, moduleOp, deviceModel))) { + moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } if (failed(verify(moduleOp, true))) { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n" - << moduleOp << "\n"); - - // Assign specific tile locations to objectFifos, starting from the set of - // potential tile locations filled in earlier. 
- if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) { - moduleOp.emitOpError() - << "tile assignment and logical objectFifo distribution failed"; - return signalPassFailure(); - } - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() - << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n" - << moduleOp << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Module after assignNonLocalTiles: \n" + << moduleOp << "\n"); } } // namespace diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp index 710d0ddfb..0af3e8b32 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeL1Allocations.cpp @@ -44,7 +44,10 @@ FailureOr> getThreadIndVars(ModuleOp moduleOp) { } /// Try to detect subview(s) that look like they're 'distributing' L1 memory. -/// That is: they slice the L1 memory along thread/tile dimensions. +/// That is: they slice the L1 memory along thread/tile dimensions. If the +/// allocation `alloc` does not look like it's distributed across threads/tiles, +/// return an empty memref type. Otherwise, return the memref type that the +/// subviews are viewing. MemRefType getDistributedType(memref::AllocOp alloc, const DenseSet &indVars) { MemRefType type; @@ -54,12 +57,17 @@ MemRefType getDistributedType(memref::AllocOp alloc, // that if a subview has an offset which is not a constant and not a // thread id, it's not 'distributing'. Operation::operand_range offsets = subview.getOffsets(); + int nIndVars{0}; for (Value offset : offsets) { bool isConst = matchPattern(offset, m_Constant()); bool isIndVar = llvm::is_contained(indVars, offset); + nIndVars += isIndVar; if (!isConst && !isIndVar) return {}; } + // If there are no thread ids, this subview is not distributing. + if (nIndVars == 0) return {}; + auto nextType = cast(subview.getResult().getType()); if (!type) { type = nextType; @@ -95,7 +103,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { if (failed(maybeIndVars)) return failure(); const DenseSet &indVars = maybeIndVars.value(); IRRewriter rewriter(moduleOp.getContext()); - moduleOp->walk([&](memref::AllocOp oldAlloc) { + auto allocWalkResult = moduleOp->walk([&](memref::AllocOp oldAlloc) + -> WalkResult { // Only consider local memory (L1). 
Attribute maybeMemorySpace = oldAlloc.getType().getMemorySpace(); if (!maybeMemorySpace) return WalkResult::advance(); @@ -173,8 +182,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { return success(); }) .Default([&](Operation *user) { - user->emitOpError("needs logic implemented for handling."); - return failure(); + return user->emitOpError( + "needs logic implemented for handling."); }); if (failed(switchResult)) return WalkResult::interrupt(); @@ -183,6 +192,8 @@ LogicalResult distributeLocalMemory(ModuleOp moduleOp) { return WalkResult::advance(); }); + if (allocWalkResult.wasInterrupted()) return failure(); + return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp index 3ce342bc8..7f31b9a78 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp @@ -6,23 +6,13 @@ #include "AMDAIEDmaUtils.h" -#include - +#include "AMDAIEUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "llvm/ADT/SmallPtrSet.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" namespace mlir::iree_compiler::AMDAIE { -/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor. -Operation *getAncestorInBlock(Operation *op, Block *block) { - if (!op || !block) return nullptr; - auto parent = op; - while (parent && (parent->getBlock() != block)) - parent = parent->getParentOp(); - return parent; -} - bool areAccessPatternsCombinable(const SmallVector &offsetsA, const SmallVector &sizesA, const SmallVector &stridesA, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp index 09cbad45c..02b5ff597 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -8,31 +8,27 @@ #include -#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Support/Debug.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Iterators.h" #include "mlir/IR/Operation.h" #define DEBUG_TYPE "iree-amdaie-logicalobjfifo-splitting-utils" namespace mlir::iree_compiler::AMDAIE { -/// Utility to create a new logical objectfifo based on shape defined by -/// `newSizesOpFoldResultArr`. +/// Hardcoded the transposed dimensions of L2 target dma for now. +/// The values are based on the results from ConvertToDma with option as +/// transposed on target, e.g., dma size [1, 1, 32, 32] -> [1, 32, 1, 32]. +const static SmallVector transposedL2Dims = {0, 2, 1, 3}; + +/// Utility to create a new logical objectfifo. 
static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( IRRewriter &rewriter, - AMDAIE::LogicalObjectFifoFromMemrefOp &oldLogicalObjectFifo, - SmallVectorImpl &newSizesOpFoldResultArr) { + AMDAIE::LogicalObjectFifoFromMemrefOp oldLogicalObjectFifo, + ArrayRef newSizes) { OpBuilder::InsertionGuard guard(rewriter); - SmallVector newSizes = llvm::map_to_vector( - newSizesOpFoldResultArr, - [](OpFoldResult sizeVal) { return getConstantIndexOrAssert(sizeVal); }); Value oldAllocOp = oldLogicalObjectFifo.getMemref(); auto oldMemRefType = cast(oldAllocOp.getType()); MemRefType newAllocType = MemRefType::get( @@ -55,22 +51,17 @@ static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( return newLogicalObjectFifo; } -/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. -SmallVector fetchDmaCpyNdOpsToSplitOrCombine( - Operation *op) { - SmallVector l2ToL1DmaOps; - // We are currently walking through CoreOps gathering 3rd Input DmaOp (if - // applicable) from them. - // TODO(avarma): We will generalize this later. - op->walk([&](AMDAIE::CoreOp coreOp) { - SmallVector inputDmas = coreOp.getInputDmas(); - if (inputDmas.size() != 3) return WalkResult::skip(); - auto dmaCpyNdOp = inputDmas[2].getDefiningOp(); - assert(dmaCpyNdOp && "expected an amdaie.dma_cpy_nd op"); - l2ToL1DmaOps.push_back(dmaCpyNdOp); - return WalkResult::advance(); - }); - return l2ToL1DmaOps; +/// Utility to create a new logical objectfifo based on shape defined by +/// `newSizesOpFoldResultArr`. +static AMDAIE::LogicalObjectFifoFromMemrefOp createNewLogicalObjectFifo( + IRRewriter &rewriter, + AMDAIE::LogicalObjectFifoFromMemrefOp oldLogicalObjectFifo, + ArrayRef newSizesOpFoldResultArr) { + OpBuilder::InsertionGuard guard(rewriter); + SmallVector newSizes = llvm::map_to_vector( + newSizesOpFoldResultArr, + [](OpFoldResult sizeVal) { return getConstantIndexOrAssert(sizeVal); }); + return createNewLogicalObjectFifo(rewriter, oldLogicalObjectFifo, newSizes); } /// Utility to verify that the split dimensions for L2 are contiguous. @@ -123,15 +114,14 @@ static LogicalResult checkIsRangeFromZero( /// . .|. .| /// . .|. .| /// ----- -static FailureOr updateL3SourceOffset(IRRewriter &rewriter, - OpFoldResult oldL3Offset, - int64_t offsetToAdd, - MLIRContext *context) { +static FailureOr addToOffset(IRRewriter &rewriter, + OpFoldResult oldL3Offset, + int64_t offsetToAdd) { auto createAffineMap = [&](AffineExpr affineExpr, int64_t offsetToAdd) -> AffineMap { AffineExpr newAffineExpr = affineExpr + offsetToAdd; return AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, {newAffineExpr}, - context); + rewriter.getContext()); }; OpFoldResult newL3AsSourceOffset; OpBuilder::InsertionGuard guard(rewriter); @@ -305,17 +295,41 @@ static LogicalResult checkWhetherSplitIsPossible( return success(); } -// Given a vector of L2->L1 Dma ops' perform the splitting :- -// 1. Check if the splitting can be performed or not. If not possible, bail out. -// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and -// L3. -// 3. Now traverse each L2->L1 Dma op and perform the following :- -// a) Create a new L2 AllocOp based on the updated size (step 3 above) and -// create a logicalobjectfifo using the same. -// b) Split L3->L2 Dma op. -// c) SPlit L2->L1 Dma op. -// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps. -LogicalResult splitLogicalObjectFifos( +/// Utility to determine if the strides of a dma copy operation might describe +/// a transposition of dimensions. 
Here we are only considering static strides. +/// If any of the static strides are in non-decreasing order from right to left, +/// then this might be a transpose. +static FailureOr isMaybeTransposed(Location loc, + ArrayRef strides) { + std::optional> maybeStrides = + getConstantIntValues(strides); + if (!maybeStrides) { + emitError(loc) << "expected static L2 strides"; + return failure(); + } + SmallVector staticStrides = maybeStrides.value(); + return !std::is_sorted(staticStrides.rbegin(), staticStrides.rend()); +} + +static FailureOr isDmaTransposedOnSourceSide(AMDAIE::DmaCpyNdOp dmaOp) { + return isMaybeTransposed(dmaOp->getLoc(), dmaOp.getSourceMixedStrides()); +} + +static FailureOr isDmaTransposedOnTargetSide(AMDAIE::DmaCpyNdOp dmaOp) { + return isMaybeTransposed(dmaOp->getLoc(), dmaOp.getTargetMixedStrides()); +} + +/// Given a vector of L2->L1 Dma ops' perform the splitting :- +/// 1. Check if the splitting can be performed. If it can't, bail out. +/// 2. For the split dimension inferred set offset = 0 and size as 1 for L2 and +/// L3. +/// 3. Now traverse each L2->L1 Dma op and perform the following :- +/// a) Create a new L2 AllocOp based on the updated size (step 2 above) and +/// create a logicalobjectfifo using the same. +/// b) Split L3->L2 Dma op. +/// c) Split L2->L1 Dma op. +/// 4. Delete old L2->L1, L3->L2 and corresponding AllocOps. +LogicalResult splitLogicalObjectFifoForElementwiseOp( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context) { SplittingLogicalObjectFifoData splittingLogicalObjectFifoData; @@ -351,31 +365,14 @@ LogicalResult splitLogicalObjectFifos( SmallVector staticL3AsSourceSizes = l3ToL2DmaOp.getSourceMixedSizes(); - LogicalObjectFifoFromMemrefOp l2TargetObjectFifo = - l3ToL2DmaOp.getTargetObjectFifo(); - ArrayRef l2TargetShape = - l2TargetObjectFifo.getMemrefType().getShape(); - if (l2TargetShape.size() != staticL2AsTargetSizes.size()) { - LLVM_DEBUG(llvm::dbgs() << "L2 target size should be the same"); - return failure(); - } - - // Check if the L3->L2 dma is transposed on the target side. - bool dmaTransposeOnSource = true; - for (auto [s1, s2] : llvm::zip_equal(l2TargetShape, staticL2AsTargetSizes)) { - if (s1 != getConstantIntValue(s2)) { - dmaTransposeOnSource = false; - break; - } - } - if (staticL3AsSourceSizes.size() != staticL2AsTargetSizes.size()) { - dmaTransposeOnSource = false; - } - OpFoldResult zeroVal = getAsIndexOpFoldResult(context, 0); OpFoldResult oneVal = getAsIndexOpFoldResult(context, 1); - if (dmaTransposeOnSource) { + FailureOr maybeTransposed = isDmaTransposedOnTargetSide(l3ToL2DmaOp); + if (failed(maybeTransposed)) return failure(); + bool dmaTransposeOnTarget = maybeTransposed.value(); + + if (!dmaTransposeOnTarget) { // Update split dimensions' offset/size for L2 as target and L3 as source. // We can afford to do this here because it's going to be the same for all // L3->L2 splits. Here we are setting offset = 0 and size = 1. @@ -389,14 +386,12 @@ LogicalResult splitLogicalObjectFifos( // The L2 target side has transposed dimensions, while the L3 source side // data are continuous and don't have `nonSplitDim`. Then the L3 source // sizes need to be modified to match the new L2 target sizes. - // Hardcoded the transposed dimensions for now. 
- const SmallVector transposeDim = {0, 2, 1, 3}; for (auto &&[splitDim, nonSplitdim] : llvm::zip_equal(splitDimsForL2, nonSplitDimsForL2)) { - staticL2AsTargetOffsets[transposeDim[splitDim]] = zeroVal; - staticL2AsTargetSizes[transposeDim[splitDim]] = oneVal; + staticL2AsTargetOffsets[transposedL2Dims[splitDim]] = zeroVal; + staticL2AsTargetSizes[transposedL2Dims[splitDim]] = oneVal; staticL3AsSourceSizes[splitDim] = - staticL2AsTargetSizes[transposeDim[nonSplitdim]]; + staticL2AsTargetSizes[transposedL2Dims[nonSplitdim]]; } } @@ -414,7 +409,7 @@ LogicalResult splitLogicalObjectFifos( // If the dma transpose is on the source(target) side, then the L2 // target(source) side has the sizes in order. SmallVector newL2Sizes = - dmaTransposeOnSource ? staticL2AsTargetSizes : staticL2AsSourceSizes; + dmaTransposeOnTarget ? staticL2AsSourceSizes : staticL2AsTargetSizes; AMDAIE::LogicalObjectFifoFromMemrefOp source = createNewLogicalObjectFifo(rewriter, oldL2ObjectFifo, newL2Sizes); @@ -422,7 +417,7 @@ LogicalResult splitLogicalObjectFifos( // ---------- L3 -> L2 splitting -------------- // -------------------------------------------- // Update L3 source offsets for non-split dimensions. Refer doc comment of - // `updateL3SourceOffset` for the computation rationale involved. + // `addToOffset` for the computation rationale involved. SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); for (auto &&[splitDim, nonSplitdim] : @@ -445,9 +440,9 @@ LogicalResult splitLogicalObjectFifos( // If the dma transpose is on the target side, L3 source side data are // continuous and don't have `nonSplitDim`. - size_t dim = dmaTransposeOnSource ? nonSplitdim : splitDim; - FailureOr newOffset = updateL3SourceOffset( - rewriter, staticL3AsSourceOffsets[dim], offsetToAdd, context); + size_t dim = dmaTransposeOnTarget ? splitDim : nonSplitdim; + FailureOr newOffset = + addToOffset(rewriter, staticL3AsSourceOffsets[dim], offsetToAdd); if (failed(newOffset)) { // TODO: Ideally we should be able to handle even +, -, *, /, etc. // But handle this later (if at all!) as such cases might not @@ -507,4 +502,238 @@ LogicalResult splitLogicalObjectFifos( return success(); } +/// Utility to get the `DmaCpyNdOp` producers and consumers of a given +/// objectFifo op. +LogicalResult getDmaCpyNdOpProducersAndConsumers( + AMDAIE::LogicalObjectFifoFromMemrefOp op, + SmallVector &producers, + SmallVector &consumers) { + for (Operation *userOp : op->getUsers()) { + if (auto stridedCopyOp = dyn_cast(userOp)) { + if (dyn_cast_if_present( + stridedCopyOp.getTarget().getDefiningOp()) == op) { + producers.push_back(stridedCopyOp); + } else if (dyn_cast_if_present( + stridedCopyOp.getSource().getDefiningOp()) == op) { + consumers.push_back(stridedCopyOp); + } else { + return op.emitOpError() + << "has non-consumer, non-producer doubly strided copy op user"; + } + } else { + return op.emitOpError() << "has non-doubly strided copy op user"; + } + } + return success(); +} + +using OffsetIndexAndNewOffsetT = std::tuple, int64_t>; + +/// Utility to return the index of the offsets array that refers to newly +/// splitted objectFifo and the respective offset value. Note that there might +/// not be a dimension with `stride == sizeAfterSplit`, in which case an offset +/// index can't be returned and the correct offset is `0`. 
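+/// For example, with sizes [2, 1, 32, 32], strides [2048, 1024, 32, 1], and
+/// `sizeAfterSplit` = 1024, the dimension with stride 1024 is the split index;
+/// if that dimension has size 1 and a static offset of 1, the second of the
+/// newly created objectFifos is selected.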
+FailureOr getOffsetIndexAndOffset(
+    ArrayRef offsets, ArrayRef sizes,
+    ArrayRef strides, size_t sizeAfterSplit,
+    function_ref emitError) {
+  SmallVector offsetIndices;
+  for (auto iter : llvm::enumerate(llvm::zip(strides, offsets))) {
+    std::optional maybeStride =
+        getConstantIntValue(std::get<0>(iter.value()));
+    std::optional maybeOffset =
+        getConstantIntValue(std::get<1>(iter.value()));
+    if (maybeStride.has_value() && maybeOffset.has_value() &&
+        maybeStride.value() == sizeAfterSplit && maybeOffset.value() != 0) {
+      offsetIndices.push_back(iter.index());
+    }
+  }
+  if (offsetIndices.size() > 1)
+    return emitError() << "multiple offset indices found";
+  int64_t size{1};
+  int64_t offset{0};
+  std::optional maybeOffsetIdx;
+  if (offsetIndices.size() == 1) {
+    size_t offsetIdx = offsetIndices[0];
+    maybeOffsetIdx = offsetIdx;
+    std::optional maybeSize = getConstantIntValue(sizes[offsetIdx]);
+    std::optional maybeOffset =
+        getConstantIntValue(offsets[offsetIdx]);
+    if (!maybeSize || !maybeOffset) {
+      return emitError()
+             << "expected a static target offset and size on index: "
+             << offsetIdx;
+    }
+    size = maybeSize.value();
+    offset = maybeOffset.value();
+  }
+  if (size != 1) {
+    return emitError() << "only a static size of 1 is currently "
+                          "supported on the split index";
+  }
+  return OffsetIndexAndNewOffsetT{maybeOffsetIdx, offset};
+}
+
+/// Split a logical objectFifo on the provided split dimension with the
+/// specified splitting factor.
+LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
+                                     AMDAIE::LogicalObjectFifoFromMemrefOp op,
+                                     size_t splitDim,
+                                     std::optional maybeSplitFactor) {
+  SmallVector memrefShape =
+      llvm::to_vector(op.getMemrefType().getShape());
+  int64_t splitFactor = maybeSplitFactor.has_value() ? maybeSplitFactor.value()
+                                                     : memrefShape[splitDim];
+  assert(memrefShape[splitDim] % splitFactor == 0 &&
+         "the target size for splitting is not divisible by the splitting "
+         "factor");
+  memrefShape[splitDim] /= splitFactor;
+
+  // Create `splitFactor` number of objectFifo ops.
+  SmallVector newObjFifos;
+  newObjFifos.reserve(splitFactor);
+  for (int i = 0; i < splitFactor; i++) {
+    newObjFifos.push_back(
+        createNewLogicalObjectFifo(rewriter, op, memrefShape));
+  }
+
+  // Get the producers and consumers of the current objectFifoOp.
+  SmallVector producers;
+  SmallVector consumers;
+  if (failed(getDmaCpyNdOpProducersAndConsumers(op, producers, consumers))) {
+    return failure();
+  }
+
+  // The number of elements after the split dimension. A dma dimension with
+  // this stride and a non-zero offset selects one of the newly created
+  // objectFifos (see `getOffsetIndexAndOffset`).
+  int64_t sizeAfterSplit =
+      std::accumulate(memrefShape.begin() + splitDim + 1, memrefShape.end(), 1,
+                      std::multiplies<>());
+  // Update the producer dma ops.
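+  // Each producer is redirected to the new objectFifo selected by its offset
+  // on the split index, and that offset is reset to 0 in the new dma op.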
+ for (AMDAIE::DmaCpyNdOp producer : producers) { + SmallVector targetOffsets = producer.getTargetMixedOffsets(); + SmallVector targetSizes = producer.getTargetMixedSizes(); + SmallVector targetStrides = producer.getTargetMixedStrides(); + std::optional maybeOffsetIdx; + int64_t targetOffset{0}; + FailureOr maybeOffsetIdxAndNewOffset = + getOffsetIndexAndOffset(targetOffsets, targetSizes, targetStrides, + sizeAfterSplit, + [&]() { return producer.emitOpError(); }); + if (failed(maybeOffsetIdxAndNewOffset)) { + return producer.emitOpError() + << "failed to find an offset index and new offset"; + } + std::tie(maybeOffsetIdx, targetOffset) = maybeOffsetIdxAndNewOffset.value(); + assert(targetOffset < newObjFifos.size() && + "the targetOffset should be smaller than the number of objectFifos"); + if (maybeOffsetIdx.has_value()) + targetOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(0); + AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = + newObjFifos[targetOffset]; + rewriter.setInsertionPoint(producer); + auto newDmaOp = rewriter.create( + producer.getLoc(), newObjFifo, targetOffsets, targetSizes, + targetStrides, producer.getSource(), producer.getSourceMixedOffsets(), + producer.getSourceMixedSizes(), producer.getSourceMixedStrides()); + rewriter.replaceOp(producer, newDmaOp); + } + + // Update the consumer dma ops. + for (AMDAIE::DmaCpyNdOp consumer : consumers) { + SmallVector sourceOffsets = consumer.getSourceMixedOffsets(); + SmallVector sourceSizes = consumer.getSourceMixedSizes(); + SmallVector sourceStrides = consumer.getSourceMixedStrides(); + std::optional maybeOffsetIdx; + int64_t sourceOffset{0}; + FailureOr maybeOffsetIdxAndNewOffset = + getOffsetIndexAndOffset(sourceOffsets, sourceSizes, sourceStrides, + sizeAfterSplit, + [&]() { return consumer.emitOpError(); }); + if (failed(maybeOffsetIdxAndNewOffset)) { + return consumer.emitOpError() + << "failed to find an offset index and offset"; + } + std::tie(maybeOffsetIdx, sourceOffset) = maybeOffsetIdxAndNewOffset.value(); + assert(sourceOffset < newObjFifos.size() && + "the sourceOffset should be smaller than the number of objectFifos"); + if (maybeOffsetIdx.has_value()) + sourceOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(0); + AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = + newObjFifos[sourceOffset]; + rewriter.setInsertionPoint(consumer); + auto newDmaOp = rewriter.create( + consumer.getLoc(), consumer.getTarget(), + consumer.getTargetMixedOffsets(), consumer.getTargetMixedSizes(), + consumer.getTargetMixedStrides(), newObjFifo, sourceOffsets, + sourceSizes, sourceStrides); + rewriter.replaceOp(consumer, newDmaOp); + } + return success(); +} + +/// Split doubly strided operations on a source and target split dimension with +/// the provided split factor. 
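+/// For example, a split factor of 2 on a dimension with (static) size 64
+/// produces two strided copies of size 32 each, with the second copy's offset
+/// on that dimension advanced by 32.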
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim, size_t targetSplitDim, + std::optional maybeSplitFactor) { + if (!op->use_empty()) + return op.emitOpError() << "can't be split because it has uses"; + SmallVector sourceOffsets = op.getSourceMixedOffsets(); + SmallVector sourceSizes = op.getSourceMixedSizes(); + SmallVector sourceStrides = op.getSourceMixedStrides(); + SmallVector targetOffsets = op.getTargetMixedOffsets(); + SmallVector targetSizes = op.getTargetMixedSizes(); + SmallVector targetStrides = op.getTargetMixedStrides(); + assert(sourceSplitDim < sourceOffsets.size() && + "the dimension to be split on should be smaller than the number of " + "source dimensions"); + assert(targetSplitDim < targetOffsets.size() && + "the dimension to be split on should be smaller than the number of " + "target dimensions"); + std::optional sourceSize = + getConstantIntValue(sourceSizes[sourceSplitDim]); + std::optional targetSize = + getConstantIntValue(targetSizes[targetSplitDim]); + if (!sourceSize) { + return op.emitOpError() + << "does not have a static source size on dim: " << sourceSplitDim; + } + if (!targetSize) { + return op.emitOpError() + << "does not have a static target size on dim: " << targetSplitDim; + } + int64_t splitFactor = maybeSplitFactor.has_value() + ? maybeSplitFactor.value() + : std::gcd(sourceSize.value(), targetSize.value()); + if (sourceSize.value() % splitFactor != 0 || + targetSize.value() % splitFactor != 0) { + return op.emitOpError() << "the target or source size is not divisible by " + "the provided splitting factor: " + << splitFactor; + } + int64_t newSourceSize = sourceSize.value() / splitFactor; + int64_t newTargetSize = targetSize.value() / splitFactor; + sourceSizes[sourceSplitDim] = rewriter.getIndexAttr(newSourceSize); + targetSizes[targetSplitDim] = rewriter.getIndexAttr(newTargetSize); + rewriter.setInsertionPoint(op); + for (int i = 0; i < splitFactor; ++i) { + FailureOr newSourceOffset = addToOffset( + rewriter, sourceOffsets[sourceSplitDim], newSourceSize); // i * + FailureOr newTargetOffset = addToOffset( + rewriter, targetOffsets[targetSplitDim], newTargetSize); // i * + if (failed(newSourceOffset)) + return op.emitOpError() << "could not create a new source offset"; + if (failed(newTargetOffset)) + return op.emitOpError() << "could not create a new target offset"; + op.createDoublyStridedOp(rewriter, targetOffsets, targetSizes, + targetStrides, sourceOffsets, sourceSizes, + sourceStrides); + sourceOffsets[sourceSplitDim] = newSourceOffset.value(); + targetOffsets[targetSplitDim] = newTargetOffset.value(); + } + rewriter.eraseOp(op); + return success(); +} + } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h index f9339b2ac..c470d917b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h @@ -11,14 +11,26 @@ namespace mlir::iree_compiler::AMDAIE { -/// Utility to help fetch those input DmaCpyNd Ops which needs to be split. -SmallVector fetchDmaCpyNdOpsToSplitOrCombine(Operation *op); - /// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops. 
-LogicalResult splitLogicalObjectFifos( +LogicalResult splitLogicalObjectFifoForElementwiseOp( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, MLIRContext *context); +/// Split a logical objectFifo on the provided split dimension with the +/// specified splitting factor. If no split factor is provided, the logical +/// objectFifo will be split on the size of the dimension being split. +LogicalResult splitLogicalObjectFifo( + IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromMemrefOp op, + size_t splitDim = 0, std::optional splitFactor = std::nullopt); + +/// Split doubly strided operations on a source and target split dimension with +/// the provided split factor. If no split factor is provided, the doubly +/// strided operation will be split on the size of the dimension being split. +LogicalResult splitDoublyStridedOp( + IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim = 0, size_t targetSplitDim = 0, + std::optional splitFactor = std::nullopt); + } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index db00f723b..a0f40c369 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -405,7 +405,7 @@ LogicalResult AIEDeviceBuilder::bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock, int &bufferId) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::BufferOp]\n"); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); auto elemType = cast(bufferOp.getType()); Value tile = mapper.lookup(bufferOp.getTile()); auto aieBufferOp = rewriter.create( @@ -431,7 +431,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( AMDAIE::ConnectionOp connectionOp, Block *deviceBlock, int &connectionIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n"); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector producerChannels; SmallVector consumerChannels; for (Value producerChannel : connectionOp.getSourceChannels()) { @@ -543,7 +543,8 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } std::pair lockPair = std::make_pair(consumerLocks[0], producerLocks[0]); - rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(memOp, deviceBlock, + deviceBlock->without_terminator().end()); createDMA(memOp, AIE::DMAChannelDir::MM2S, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, lockPair, packetId); @@ -631,7 +632,8 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } std::pair lockPair = std::make_pair(producerLocks[0], consumerLocks[0]); - rewriter.moveOpBefore(memOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(memOp, deviceBlock, + deviceBlock->without_terminator().end()); createDMA(memOp, AIE::DMAChannelDir::S2MM, channel.getValue(), dims, acqNum, acqNum, maybeSize.value(), maybeOffset.value(), buffers, lockPair, packetId); @@ -649,7 +651,7 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, Block *deviceBlock) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ConnectionOp]\n"); - rewriter.setInsertionPointToEnd(deviceBlock); + 
rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector producerChannels; SmallVector consumerChannels; for (Value producerChannel : flowOp.getSources()) { @@ -671,7 +673,7 @@ LogicalResult AIEDeviceBuilder::flowToAIE(AMDAIE::FlowOp flowOp, consumerChannels.push_back(channelOp); } // Insert flow ops. - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); SmallVector flowOps = createFlowOps(flowOp, producerChannels, consumerChannels); return success(); @@ -681,7 +683,7 @@ LogicalResult AIEDeviceBuilder::lockToAIE(AMDAIE::LockOp lockOp, Block *deviceBlock, int &lockIndex) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LockOp]\n"); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); Value tile = mapper.lookup(lockOp.getTile()); auto aieLockOp = rewriter.create( lockOp.getLoc(), tile, lockOp.getValueAttr(), lockOp.getInitValueAttr(), @@ -712,7 +714,7 @@ LogicalResult logicalObjFifoFromBuffersToMemOp( for (Value tile : logicalObjFifo.getTiles()) { if (tileToMemOpMap.contains(tile)) continue; Value aieTile = mapper.lookup(tile); - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); auto newMemOp = rewriter.create(rewriter.getUnknownLoc(), aieTile); rewriter.setInsertionPointToStart(&newMemOp.getRegion().emplaceBlock()); rewriter.create(rewriter.getUnknownLoc()); @@ -855,7 +857,7 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, return WalkResult::advance(); }) .Default([&](Operation *op) { - rewriter.setInsertionPointToEnd(deviceBlock); + rewriter.setInsertionPoint(deviceBlock->getTerminator()); if (!isa_and_present(op->getDialect())) { rewriter.clone(*op, mapper); } else { @@ -868,7 +870,8 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, if (res.wasInterrupted()) return failure(); // Merge core operations into end of the device block - rewriter.mergeBlocks(deviceCoreBlock, deviceBlock); + rewriter.inlineBlockBefore(deviceCoreBlock, deviceBlock, + deviceBlock->without_terminator().end()); return success(); } @@ -902,7 +905,9 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { auto deviceOp = rewriter.create( rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice)); - Block *deviceBlock = &deviceOp.getRegion().emplaceBlock(); + xilinx::AIE::DeviceOp::ensureTerminator(deviceOp.getRegion(), rewriter, + deviceOp.getLoc()); + Block *deviceBlock = deviceOp.getBody(); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); // Create aiex.runtime_sequence inside aie.device @@ -954,7 +959,8 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { } // Move NPU instruction function to the end of the device block. - rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); + rewriter.moveOpBefore(npuFuncOp, deviceBlock, + deviceBlock->without_terminator().end()); // After walking the FuncOp, it has been converted into a DeviceOp and can // safely be erased. 
eraseOp(funcOp);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
index 0a6118ab3..6cd8d86b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEPeelForLoop.cpp
@@ -4,10 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
-#include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "iree-amdaie-peel-for-loop"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
new file mode 100644
index 000000000..83e29bffe
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -0,0 +1,222 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Utility struct to represent DMA split information.
+struct DmaSplitInfo {
+  size_t sourceSplitDim{0};
+  size_t targetSplitDim{0};
+};
+
+using DmaObjFifoPairT =
+    std::pair<AMDAIE::DmaCpyNdOp, AMDAIE::LogicalObjectFifoFromMemrefOp>;
+
+/// Find the logical objectFifo and DMA source/target splitting dimensions for
+/// each DMA and objectFifo pair.
+///
+/// Each pair is handled in the following way:
+/// First, compute the objectFifo splitting dimension as the first (outermost)
+/// non-unit shape dimension. Afterwards, depending on which logical objectFifo
+/// is being split on, find the outermost dimension in either the source or
+/// target access pattern that has:
+/// - stride == sizeAfterSplit
+/// - size != 1
+/// This is the splitting dimension to be used on the respective side of the
+/// DMA operation. Then, calculate the product size of that side of the DMA
+/// operation after the splitting dimension and use it to calculate the
+/// splitting dimension on the other side as the first dimension from the back
+/// that has a product size larger than the other side's product size after
+/// splitting, because that's the number of elements that should be
+/// produced/consumed on the respective sides before splitting.
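+///
+/// Purely illustrative walkthrough (shapes invented for this comment, not
+/// taken from the lit tests): for an L2 objectFifo with memref shape
+/// [2, 1, 32, 32], the objectFifo split dimension is dim 0 and
+/// sizeAfterSplit = 1 * 32 * 32 = 1024. For an L3 -> L2 DMA writing to this
+/// objectFifo with target sizes [2, 32, 32] and target strides [1024, 32, 1],
+/// the target split dimension is dim 0 (stride == 1024, size != 1) and the
+/// product of the target sizes after it is 32 * 32 = 1024. With source sizes
+/// [2, 32, 32], no inner dimension has a suffix product larger than 1024, so
+/// the source split dimension stays at the default dim 0.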
+LogicalResult collectSplittingDims(
+    const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
+    DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
+    DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t>
+        &objFifoSplitDimMap) {
+  for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
+    LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
+    ArrayRef<int64_t> memrefShape = objFifo.getMemrefType().getShape();
+    if (llvm::any_of(memrefShape, [](int64_t size) {
+          return ShapedType::isDynamic(size);
+        })) {
+      return objFifo.emitOpError()
+             << "can't find a valid split dimension for dynamic sizes memref";
+    }
+    auto iter = std::find_if(memrefShape.begin(), memrefShape.end(),
+                             [](int64_t size) { return size > 1; });
+    size_t objFifoSplitDim = std::distance(memrefShape.begin(), iter);
+    // If all dimensions are unit (1), no splitting can be done, so continue to
+    // the next pair.
+    if (objFifoSplitDim >= memrefShape.size()) continue;
+    int64_t sizeAfterSplit =
+        std::accumulate(memrefShape.begin() + objFifoSplitDim + 1,
+                        memrefShape.end(), 1, std::multiplies<>());
+
+    size_t sourceSplitDim{0};
+    size_t targetSplitDim{0};
+    if (dmaOp.getTargetObjectFifo() == objFifo) {
+      std::optional<SmallVector<int64_t>> targetSizes =
+          getConstantIntValues(dmaOp.getTargetMixedSizes());
+      std::optional<SmallVector<int64_t>> targetStrides =
+          getConstantIntValues(dmaOp.getTargetMixedStrides());
+      std::optional<SmallVector<int64_t>> sourceSizes =
+          getConstantIntValues(dmaOp.getSourceMixedSizes());
+      if (!targetSizes.has_value() || !targetStrides.has_value() ||
+          !sourceSizes.has_value()) {
+        return dmaOp.emitOpError() << "has unsupported dynamic target strides "
+                                      "or sizes or source sizes";
+      }
+      for (auto iter : llvm::enumerate(
+               llvm::zip(targetSizes.value(), targetStrides.value()))) {
+        int64_t size = std::get<0>(iter.value());
+        int64_t stride = std::get<1>(iter.value());
+        if (stride == sizeAfterSplit && size != 1) {
+          targetSplitDim = iter.index();
+          break;
+        }
+      }
+      int64_t targetSizeAfterSplit =
+          std::accumulate(targetSizes.value().begin() + targetSplitDim + 1,
+                          targetSizes.value().end(), 1, std::multiplies<>());
+      SmallVector<int64_t> sourceProductSizes = sourceSizes.value();
+      std::partial_sum(sourceProductSizes.rbegin(), sourceProductSizes.rend(),
+                       sourceProductSizes.rbegin(),
+                       std::multiplies<int64_t>());
+      for (int idx = sourceProductSizes.size() - 1; idx > 0; idx--) {
+        if (sourceProductSizes[idx] > targetSizeAfterSplit) {
+          sourceSplitDim = idx;
+          break;
+        }
+      }
+    } else if (dmaOp.getSourceObjectFifo() == objFifo) {
+      // Find outermost dimension in the access pattern that has stride ==
+      // sizeAfterSplit and size != 1.
+      std::optional<SmallVector<int64_t>> sourceSizes =
+          getConstantIntValues(dmaOp.getSourceMixedSizes());
+      std::optional<SmallVector<int64_t>> sourceStrides =
+          getConstantIntValues(dmaOp.getSourceMixedStrides());
+      std::optional<SmallVector<int64_t>> targetSizes =
+          getConstantIntValues(dmaOp.getTargetMixedSizes());
+      if (!sourceSizes.has_value() || !sourceStrides.has_value() ||
+          !targetSizes.has_value()) {
+        return dmaOp.emitOpError() << "has unsupported dynamic source strides "
+                                      "or sizes or target sizes";
+      }
+      for (auto iter : llvm::enumerate(
+               llvm::zip(sourceSizes.value(), sourceStrides.value()))) {
+        int64_t size = std::get<0>(iter.value());
+        int64_t stride = std::get<1>(iter.value());
+        if (stride == sizeAfterSplit && size != 1) {
+          sourceSplitDim = iter.index();
+          break;
+        }
+      }
+      int64_t sourceRemainderSize =
+          std::accumulate(sourceSizes.value().begin() + sourceSplitDim + 1,
+                          sourceSizes.value().end(), 1, std::multiplies<>());
+      SmallVector<int64_t> targetProductSizes = targetSizes.value();
+      std::partial_sum(targetProductSizes.rbegin(), targetProductSizes.rend(),
+                       targetProductSizes.rbegin(),
+                       std::multiplies<int64_t>());
+      for (int idx = targetProductSizes.size() - 1; idx > 0; idx--) {
+        if (targetProductSizes[idx] > sourceRemainderSize) {
+          targetSplitDim = idx;
+          break;
+        }
+      }
+    }
+    LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n");
+    DmaSplitInfo dmaSplitInfo = {sourceSplitDim, targetSplitDim};
+    dmaSplitInfoMap[dmaOp] = std::move(dmaSplitInfo);
+    objFifoSplitDimMap[objFifo] = objFifoSplitDim;
+  }
+  return success();
+}
+
+class AMDAIESplitLogicalObjFifosPass
+    : public impl::AMDAIESplitLogicalObjFifosBase<
+          AMDAIESplitLogicalObjFifosPass> {
+ public:
+  using AMDAIESplitLogicalObjFifosBase::AMDAIESplitLogicalObjFifosBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+  IRRewriter rewriter(context);
+
+  // Walk and collect all dma ops between L3 and L2.
+  SmallVector<AMDAIE::DmaCpyNdOp> l3L2DmaOps;
+  SmallVector<DmaObjFifoPairT> dmaObjFifoPairs;
+  WalkResult res = moduleOp->walk([&](AMDAIE::DmaCpyNdOp op) {
+    std::optional<uint8_t> sourceMemSpace = op.getSourceMemorySpaceAsUInt();
+    std::optional<uint8_t> targetMemSpace = op.getTargetMemorySpaceAsUInt();
+    if (!sourceMemSpace || !targetMemSpace) {
+      op.emitOpError() << "expected a source and target memory space";
+      return WalkResult::interrupt();
+    }
+    if (sourceMemSpace.value() == 1 && targetMemSpace.value() == 0) {
+      dmaObjFifoPairs.push_back({op, op.getSourceObjectFifo()});
+    } else if (sourceMemSpace.value() == 0 && targetMemSpace.value() == 1) {
+      dmaObjFifoPairs.push_back({op, op.getTargetObjectFifo()});
+    }
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return signalPassFailure();
+
+  // Collect the split dimensions for all DMA and objectFifo pairs.
+  DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> dmaSplitInfoMap;
+  DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t> objFifoSplitDimMap;
+  if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap,
+                                  objFifoSplitDimMap))) {
+    return signalPassFailure();
+  }
+
+  /// Split the DMA and objectFifo ops based on the calculated splitting
+  /// dimensions.
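+  /// For example (illustrative only): with a split factor of 2, a DMA whose
+  /// source and target split dimensions both have size 2 is rewritten into
+  /// two DMA ops, and a [2, 1, 32, 32] L2 objectFifo split on dim 0 becomes
+  /// two [1, 1, 32, 32] objectFifos, so that they can be distributed over
+  /// multiple memTiles/shimTiles.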
+  for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
+    auto stridedOp =
+        cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
+    if (failed(splitDoublyStridedOp(rewriter, stridedOp,
+                                    dmaSplitInfo.sourceSplitDim,
+                                    dmaSplitInfo.targetSplitDim))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Failed to perform splitting of the DMA op: " << dmaOp);
+      return signalPassFailure();
+    }
+  }
+  for (auto &&[objFifo, splitDim] : objFifoSplitDimMap) {
+    if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitDim))) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Failed to perform splitting of objectFifo op");
+      return signalPassFailure();
+    }
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass() {
+  return std::make_unique<AMDAIESplitLogicalObjFifosPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
index 4839246a4..f4ac4f9fd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifosForConnectionReuse.cpp
@@ -6,8 +6,6 @@
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/Transforms/AMDAIELogicalObjFifoSplittingUtils.h"
 #include "iree-amd-aie/Transforms/Passes.h"
-// #include "llvm/Support/Debug.h"
-#include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "iree-amdaie-split-logical-objectfifos-for-connection-reuse"
@@ -34,10 +32,24 @@ void AMDAIESplitLogicalObjFifosForConnectionReusePass::runOnOperation() {
   MLIRContext *context = &getContext();
   IRRewriter rewriter(context);
 
-  SmallVector<AMDAIE::DmaCpyNdOp> l2ToL1DmaOps =
-      fetchDmaCpyNdOpsToSplitOrCombine(moduleOp);
+  // Walk through CoreOps gathering 3rd input DmaOps (if applicable) which will
+  // be used to split L2 objectFifos of elementwise input for connection reuse.
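+  // (Illustrative assumption, not enforced here: for a fused matmul +
+  // elementwise core the three input DMAs are expected to be the matmul LHS,
+  // the matmul RHS and the extra elementwise operand, so the third input DMA
+  // below is the one whose L2 objectFifo gets split for connection reuse.)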
+ SmallVector l2ToL1DmaOps; + WalkResult res = moduleOp->walk([&](AMDAIE::CoreOp coreOp) { + SmallVector inputDmas = coreOp.getInputDmas(); + if (inputDmas.size() != 3) return WalkResult::skip(); + auto dmaCpyNdOp = inputDmas[2].getDefiningOp(); + if (!dmaCpyNdOp) { + coreOp->emitOpError() << "failed to get a DmaCpyNdOp from the input"; + return WalkResult::interrupt(); + } + l2ToL1DmaOps.push_back(dmaCpyNdOp); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); - if (failed(splitLogicalObjectFifos(rewriter, l2ToL1DmaOps, context))) { + if (failed(splitLogicalObjectFifoForElementwiseOp(rewriter, l2ToL1DmaOps, + context))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of logicalobjectfifos"); return signalPassFailure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp index ae6081349..2cd57beb2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp @@ -6,10 +6,10 @@ #include "AMDAIEUtils.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringExtras.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Iterators.h" namespace mlir::iree_compiler::AMDAIE { @@ -251,7 +251,7 @@ bool isMatmulInDefChain(Value operand) { /// Utility to identify if `linalgOp` is an elementwise operation with a /// matmul-like op upstream in its computation tree. bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp) { - if (!isElementwise(linalgOp) || isa(linalgOp)) { + if (!linalg::isElementwise(linalgOp) || isa(linalgOp)) { return false; } // Check if any of the defining op is a matmul-like op. @@ -272,6 +272,13 @@ std::string utohexstr(uint32_t value, size_t width, bool header, return res + prefix + hexStr; } +/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor. +Operation *getAncestorInBlock(Operation *op, Block *block) { + if (!op || !block) return nullptr; + while (op && (op->getBlock() != block)) op = op->getParentOp(); + return op; +} + /// Find the largest factor of 'num' which is not larger than 'max'. int detail::findLargestFactor(int num, int max) { assert(max > 0 && "No factors less than or equal to 0 exist"); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h index e78ebc281..3daac3cd7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h @@ -7,11 +7,9 @@ #ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIEUTILS_H_ #define IREE_AMD_AIE_TRANSFORMS_AMDAIEUTILS_H_ -#include - -#include "iree-amd-aie/IR/AMDAIEAttrs.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "iree-amd-aie/aie_runtime/AMDAIEEnums.h" +#include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/IR/Types.h" namespace mlir::iree_compiler::AMDAIE { @@ -62,15 +60,22 @@ bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp); std::string utohexstr(uint32_t value, size_t width, bool header = true, bool lowercase = false); +/// If `op` is in `block`, then return `op`. 
Otherwise traverse through parents +/// to the first ancestor of `op` that is in `block`, and return that +/// ancestor. If `op` has no ancestor in `block`, or if `op` is nullptr or +/// `block` is nullptr, return nullptr. +Operation *getAncestorInBlock(Operation *op, Block *block); + namespace detail { -// Returns the largest number that perfectly divides `num` that -// is less than or equal to max +/// Returns the largest number that perfectly divides `num` that +/// is less than or equal to max int findLargestFactor(int num, int max); -// A variant where we prefer factors to also be a multiple of `multiple` +/// A variant where we prefer factors to also be a multiple of `multiple` int findLargestFactor(int num, int max, int multiple); + } // namespace detail } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index c466003aa..e56c949d3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -52,12 +52,14 @@ iree_cc_library( "AMDAIEAssignLogicalObjectFifoDepth.cpp" "AMDAIEAssignNpuDmaBdIds.cpp" "AMDAIEAssignPacketIds.cpp" + "AMDAIEAssignTiles.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeNpuDmaCpyNd.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" "AMDAIECombineStridedOps.cpp" "AMDAIEConnectionToFlow.cpp" "AMDAIEConvertToDma.cpp" + "AMDAIEControlCodeForallToFor.cpp" "AMDAIEControlCodeLowering.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIEControlCodeToTransaction.cpp" @@ -101,6 +103,7 @@ iree_cc_library( "AMDAIEPropagateDataLayout.cpp" "AMDAIERemoveMemorySpace.cpp" "AMDAIESinkIntoCore.cpp" + "AMDAIESplitLogicalObjFifos.cpp" "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp" "AMDAIETemporaryAllocBufferization.cpp" "AMDAIETile.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp index 9f937b8b4..4d92fb59c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/DecomposeLinalgExtPackUnPackToAIR.cpp @@ -241,10 +241,11 @@ FailureOr lowerUnPack( auto tileShape = srcShape.drop_front(destRank); // Append the inner tile shape to the permuted and rank-reduced outer shape. 
readShape.append(tileShape.begin(), tileShape.end()); - Type elemType = unPackOp.getInputType().getElementType(); - Attribute memorySpace = - cast(unPackOp.getInputType()).getMemorySpace(); - auto readType = MemRefType::get(readShape, elemType, nullptr, memorySpace); + MemRefType inputType = cast(unPackOp.getInputType()); + auto readType = + cast(memref::SubViewOp::inferRankReducedResultType( + readShape, inputType, readOffsets, readSizes, readStrides)); + tile = rewriter.create(loc, readType, input, readOffsets, readSizes, readStrides); perm = getPackUnpackNormalizedPerm(readType.getRank(), innerDimsPos); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 47cbc5dff..232a7b874 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -27,6 +27,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEASSIGNLOGICALOBJECTFIFODEPTH #define GEN_PASS_DEF_AMDAIEASSIGNNPUDMABDIDS #define GEN_PASS_DEF_AMDAIEASSIGNPACKETIDS +#define GEN_PASS_DEF_AMDAIEASSIGNTILES #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP @@ -34,6 +35,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECLEANUP #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW +#define GEN_PASS_DEF_AMDAIECONTROLCODEFORALLTOFOR #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL #define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING #define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION @@ -80,6 +82,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT #define GEN_PASS_DEF_AMDAIEREMOVEMEMORYSPACE #define GEN_PASS_DEF_AMDAIESINKINTOCORE +#define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOS #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE #define GEN_PASS_DEF_AMDAIETEMPORARYALLOCBUFFERIZATION #define GEN_PASS_DEF_AMDAIETILE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 21929a568..98297208e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -602,6 +602,13 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass()); + // Currently, SplitLogicalObjFifos pass only works for matmul-like ops. + if (useTilePipeline == TilePassPipeline::PackPeelPipeline) + passManager.addPass(createAMDAIESplitLogicalObjFifosPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + + passManager.addPass(createAMDAIEAssignTilesPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); @@ -622,6 +629,12 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + // Convert control code `scf.forall` ops to `scf.for` ops right before the DMA + // composition optimization pass to enable more loop subsumption optimization + // opportunities. 
+ passManager.addPass(createAMDAIEControlCodeForallToForPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCompositionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index aab36fd7c..3020dc969 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -93,6 +93,9 @@ std::unique_ptr createAMDAIEAssignNpuDmaBdIdsPass(); /// Create a pass to assign packet ids to `amdaie.flow` operations. std::unique_ptr createAMDAIEAssignPacketIdsPass(); +/// Create a pass to assign physical tile locations to logical objFifos. +std::unique_ptr createAMDAIEAssignTilesPass(); + /// Create a pass to do some rewrites that help bridging the path to AIR/AIE /// lowering. std::unique_ptr createAMDAIEBridgeToAIRPass(); @@ -112,6 +115,9 @@ std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass( /// Create pass to create `amdaie.flow` ops for connections. std::unique_ptr createAMDAIEConnectionToFlowPass(); +/// Pass to convert `scf.forall` to `scf.for` within `amdaie.controlcode`. +std::unique_ptr createAMDAIEControlCodeForallToForPass(); + /// Pass to unroll the loops within the control code regions. std::unique_ptr createAMDAIEControlCodeLoopUnrollPass(); @@ -276,6 +282,9 @@ std::unique_ptr createAMDAIERemoveMemorySpacePass(); /// Create a pass to sink all dependencies into `amdaie.core` operations. std::unique_ptr createAMDAIESinkIntoCorePass(); +/// Create a pass to split logicalobjectfifos for shimTile/memTile distribution. +std::unique_ptr createAMDAIESplitLogicalObjFifosPass(); + /// Create a pass to split logicalobjectfifos for connection reuse. std::unique_ptr createAMDAIESplitLogicalObjFifosForConnectionReusePass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 035cec0a1..9a4b2ce62 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -67,6 +67,12 @@ def AMDAIEAssignPacketIds : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignPacketIdsPass()"; } +def AMDAIEAssignTiles : Pass<"iree-amdaie-assign-tiles", ""> { + let summary = "Assign physical tile locations to logical objectFifos. 
" + "Existing assignments will be ignored/replaced."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAssignTilesPass()"; +} + def AMDAIEBridgeToAIR : Pass<"iree-amdaie-bridge-to-air", ""> { let summary = "Perform transformations that allow hooking into AIR/AIE lowering"; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEBridgeToAIRPass()"; @@ -147,6 +153,12 @@ def AMDAIEConnectionToFlow : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConnectionToFlowPass()"; } +def AMDAIEControlCodeForallToFor : + Pass<"iree-amdaie-controlcode-forall-to-for", ""> { + let summary = "Converts `scf.forall` to `scf.for` within `amdaie.controlcode`."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeForallToForPass()"; +} + def AMDAIEControlCodeLoopUnroll : Pass<"iree-amdaie-controlcode-loop-unroll", ""> { let summary = "Unroll the loops in the control code regions."; @@ -667,6 +679,25 @@ def AMDAIESinkIntoCore : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()"; } +def AMDAIESplitLogicalObjFifos : + Pass<"iree-amdaie-split-logical-objectfifos", "ModuleOp"> { + let summary = "Pass to split L2 buffers to distribute on multiple shimTiles and memTiles."; + let description = [{ + Splitting L2 input and output logical objectFifos and their user dma operations, + so that the logical objectFifos can be distributed on multiple shimTiles/memTiles. + Ideally, the split factor should only depend on the number of AIE columns being used, + however the current implementation only considers situations in which `nrows == ncols`. + + For example, for the case of a matmul C = A x B, the two outermost dimensions + of the L2 buffers are the implications of `nrows x ncols` AIE cores being used. + So if, A matrix is distributed on a 2x2 AIE array, with L2 buffer size + `[2, 1, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers. + Similarly, B matrix is distributed on a 2x2 AIE array with L2 buffer size + `[1, 2, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers. + }]; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosPass()"; +} + def AMDAIESplitLogicalObjFifosForConnectionReuse : Pass<"iree-amdaie-split-logical-objectfifos-for-connection-reuse", "ModuleOp"> { let summary = "Pass to split L2 buffers to share inputs of Matmul and Elementwise operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 10d444584..ca56f4446 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -14,6 +14,15 @@ namespace mlir::iree_compiler::AMDAIE { +/// Assign tile locations to the logical objectfifos with local memory space +/// (L1). +LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op); + +/// Assign tile locations to the logical objectfifos with non-local memory space +/// (L2, L3 etc, not L1). +LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op, + const AMDAIEDeviceModel &deviceModel); + /// Unroll the loops within the control code regions. 
LogicalResult controlCodeLoopUnroll(RewriterBase &rewriter, AMDAIE::ControlCodeOp controlCodeOp); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 2e5a400cf..151812b33 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -16,6 +16,7 @@ iree_lit_test_suite( "assign_logical_objectfifo_depth.mlir" "assign_npu_dma_bd_ids.mlir" "assign_packet_ids.mlir" + "assign_tiles.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_doubly_strided_op.mlir" @@ -23,6 +24,7 @@ iree_lit_test_suite( "canonicalize_npu_dma_cpy_nd.mlir" "combine_strided_ops.mlir" "connection_to_flow.mlir" + "controlcode_forall_to_for.mlir" "controlcode_loop_unrolling.mlir" "controlcode_lowering.mlir" "controlcode_to_transaction.mlir" @@ -73,6 +75,7 @@ iree_lit_test_suite( "propagate_data_layout.mlir" "remove_memory_space.mlir" "sink_into_core.mlir" + "split_logicalobjfifos.mlir" "split_logicalobjfifos_for_connection_reuse.mlir" "temporary_alloc_bufferization.mlir" "tile_and_fuse_using_scf_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir index 6ef0acc6b..d8d45da0f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir @@ -119,13 +119,13 @@ func.func @read_and_write(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + scf.for %arg = %c0 to %c32 step %c1 { + %access1= amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : memref<32xi32, 2>) + } + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @loop_without_epilogue +// CHECK: amdaie.core +// CHECK-NEXT: acquire +// CHECK-NEXT: access +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-NEXT: scf.for +// CHECK-SAME: { +// CHECK-NEXT: acquire +// CHECK-NEXT: access +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-SAME: {size = 1 : i32} +// CHECK-NEXT: } +func.func @loop_without_epilogue(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + %access1 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : 
memref<32xi32, 2>) + scf.for %arg = %c0 to %c32 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + } + amdaie.end + } + return +} + +// ----- + +// Test of the case +// +// for ... { +// access +// for ... { +// access +// } +// } +// +// with expected result +// +// for ... { +// acquire +// release +// for ... { +// acquire +// release +// } +// } + +// CHECK-LABEL: @nested_for_loops +// CHECK: amdaie.core +// CHECK: scf.for +// CHECK: acquire +// CHECK: access +// CHECK: linalg.fill +// CHECK: logicalobjectfifo.release +// CHECK: scf.for +// CHECK: acquire +// CHECK: access +// CHECK: linalg.fill +// CHECK: logicalobjectfifo.release +// CHECK: } +// CHECK: } +// CHECK: amdaie.end +func.func @nested_for_loops(%arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %tile = amdaie.tile(%c0, %c0) + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + scf.for %arg_0 = %c0 to %c4 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0 : index) outs(%access2 : memref<32xi32, 2>) + scf.for %arg_1 = %c0 to %c8 step %c1 { + %access3 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0 : index) outs(%access3 : memref<32xi32, 2>) + } + } + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @epilogue_write_with_preceding_none_accesses +// CHECK: amdaie.core +// CHECK-NEXT: acquire +// CHECK-SAME: Produce +// CHECK-NEXT: access +// CHECK-SAME: Write +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: scf.for +// CHECK-SAME: { +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: } +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: logicalobjectfifo.release +// CHECK-SAME: Produce +// CHECK-NEXT: amdaie.end + +// With the current implementation, the acquire for Write access is inserted +// before the very first access of the objectfifo, which in this case is on a +// None access. 
+func.func @epilogue_write_with_preceding_none_accesses( + %arg0: !amdaie.logicalobjectfifo>, + %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %connection = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile, in : [%connection], out : []) { + %access1 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access1 : memref<32xi32, 2>) + scf.for %arg = %c0 to %c32 step %c1 { + %access2 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access2 : memref<32xi32, 2>) + } + %access3 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<32xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%access3 : memref<32xi32, 2>) + amdaie.end + } + return +} + diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir new file mode 100644 index 000000000..10bd42978 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir @@ -0,0 +1,360 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-assign-tiles,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Test assignment of L1 objFifos based on the cores where they are used. 
+// CHECK-LABEL: @assign_local_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_local_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<1024xi32, 2> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.core(%tile_0_2, in : [], out : []) { + %3 = amdaie.logicalobjectfifo.access(%0, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32, 2> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %alloc = memref.alloc() : memref<2048xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %5 = amdaie.core(%tile_0_3, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L2 objFifos onto different columns. 
+// CHECK-LABEL: @assign_l2_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l2_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32, 1> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<1024xi32, 2> + %alloc_3 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.dma_cpy_nd(%3[] [] [], %2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.core(%tile_0_2, in : [], out : []) { + %7 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %8 = amdaie.core(%tile_1_2, in : [], out : []) { + %9 = amdaie.logicalobjectfifo.access(%3, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32, 1> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<1024xi32, 2> + memref.dealloc %alloc_3 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 and L2 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l3_l2_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l2_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 1> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 1> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos based on L1 assignments. 
+// CHECK-LABEL: @assign_l3_l1_tiles +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_l1_tiles() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.core(%tile_0_2, in : [], out : []) { + %4 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test assignment of L3 objFifos onto different columns. 
+// CHECK-LABEL: @assign_l3_tiles_on_diff_cols +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<1024xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_0]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @assign_l3_tiles_on_diff_cols() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1024xi32> + %alloc_1 = memref.alloc() : memref<2048xi32> + %alloc_2 = memref.alloc() : memref<1024xi32, 1> + %alloc_3 = memref.alloc() : memref<2048xi32, 1> + %alloc_4 = memref.alloc() : memref<1024xi32, 2> + %alloc_5 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1024xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1024xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2048xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%4[] [] [], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%5[] [] [], %4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.core(%tile_0_2, in : [], out : []) { + %11 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<1024xi32, 2> + amdaie.end + } + %12 = amdaie.core(%tile_1_2, in : [], out : []) { + %13 = amdaie.logicalobjectfifo.access(%5, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<1024xi32> + memref.dealloc %alloc_1 : memref<2048xi32> + memref.dealloc %alloc_2 : memref<1024xi32, 1> + memref.dealloc %alloc_3 : memref<2048xi32, 1> + memref.dealloc %alloc_4 : memref<1024xi32, 2> + memref.dealloc %alloc_5 : memref<2048xi32, 2> + return + } +} + +// ----- + +// Test duplicate global logical objectFifos (L3). 
+// CHECK-LABEL: @duplicate_global_object_fifos +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]]} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @duplicate_global_object_fifos() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<2048xi32> + %alloc_1 = memref.alloc() : memref<2048xi32, 2> + %alloc_2 = memref.alloc() : memref<2048xi32, 2> + amdaie.workgroup { + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<2048xi32, 0> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2048xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%2[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.core(%tile_0_2, in : [], out : []) { + %6 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + %7 = amdaie.core(%tile_1_2, in : [], out : []) { + %8 = amdaie.logicalobjectfifo.access(%2, Read) : !amdaie.logicalobjectfifo> -> memref<2048xi32, 2> + amdaie.end + } + amdaie.controlcode { + amdaie.end + } + } + memref.dealloc %alloc : memref<2048xi32> + memref.dealloc %alloc_1 : memref<2048xi32, 2> + memref.dealloc %alloc_2 : memref<2048xi32, 2> + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir new file mode 100644 index 000000000..02f5e5120 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_forall_to_for.mlir @@ -0,0 +1,132 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-forall-to-for,canonicalize)" --split-input-file %s | FileCheck %s + +// CHECK-LABEL: @test_promotion +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: func.call @callee(%[[C0]], %[[C0]]) : (index, index) -> () +module @test_promotion { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + 
scf.forall (%i, %j) in (1, 1) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_single +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +module @test_single { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_multi +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]]) : (index, index) -> () +module @test_multi { + func.func private @callee(%i: index, %j: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (3, 2) { + func.call @callee(%i, %j) : (index, index) -> () + } + scf.forall (%i, %j) in (2, 3) { + func.call @callee(%i, %j) : (index, index) -> () + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_nested +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: func.call @callee(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]) : (index, index, index, index) -> () +module @test_nested { + func.func private @callee(%i: index, %j: index, %k: index, %l: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + scf.forall (%k, %l) in (2, 3) { + func.call @callee(%i, %j, %k, %l) : (index, index, index, index) -> () + } + } + amdaie.end + } + } +} + +// ----- + +// CHECK-LABEL: @test_affine_apply +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: amdaie.controlcode +// CHECK: scf.for %[[ARG0:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: %[[APPLY0:.+]] = affine.apply #map(%[[ARG0]]) +// CHECK: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: %[[APPLY1:.+]] = affine.apply #map(%[[ARG1]]) +// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]] { +// CHECK: %[[APPLY2:.+]] = affine.apply #map(%[[ARG2]]) +// CHECK: scf.for %[[ARG3:.+]] 
= %[[C0]] to %[[C3]] step %[[C1]] { +// CHECK: %[[APPLY3:.+]] = affine.apply #map(%[[ARG3]]) +// CHECK: func.call @callee(%[[APPLY0]], %[[APPLY1]], %[[APPLY2]], %[[APPLY3]]) : (index, index, index, index) -> () +#map = affine_map<(d0) -> (d0 * 32)> +module @test_affine_apply { + func.func private @callee(%i: index, %j: index, %k: index, %l: index) + amdaie.workgroup { + amdaie.controlcode { + scf.forall (%i, %j) in (2, 3) { + scf.forall (%k, %l) in (2, 3) { + %0 = affine.apply #map(%i) + %1 = affine.apply #map(%j) + %2 = affine.apply #map(%k) + %3 = affine.apply #map(%l) + func.call @callee(%0, %1, %2, %3) : (index, index, index, index) -> () + } + } + amdaie.end + } + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index bfe6dc456..f7704d4db 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -100,7 +100,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000028 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0001D21C +// CHECK: 0x0601D21C // CHECK: 0x00000000 // CHECK: 0x803F0002 // CHECK: 0x00000018 @@ -127,7 +127,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000038 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0001D214 +// CHECK: 0x0401D214 // CHECK: 0x00000000 // CHECK: 0x80FF000F // CHECK: 0x00000018 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 3a9e583bc..57a15c673 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -1,5 +1,19 @@ // RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s +// expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. There are no dma ops in this // check. 
@@ -18,7 +32,8 @@ // CHECK: %{{.*}} = amdaie.core(%[[TILE_2]], in : [], out : []) // CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) // CHECK: %{{.*}} = amdaie.core(%[[TILE_3]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { @@ -50,7 +65,8 @@ module { // CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]], in : [], out : []) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) // CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]], in : [], out : []) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { scf.forall (%arg2, %arg3) in (2, 2) { @@ -92,7 +108,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_1]]], out : []) // CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -142,7 +159,8 @@ module { // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_single_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -196,7 +214,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_0]]], out : []) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_and_affine_single_loop_2x1() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> @@ -251,7 +270,8 @@ module { // CHECK-DAG: amdaie.core(%[[TILE_0_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @unroll_dma_and_affine_single_loop() { %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : 
memref<32x1024xi32, 1> @@ -308,7 +328,8 @@ module { // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_0]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -367,7 +388,8 @@ module { // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_1]]], out : []) // CHECK-DAG: amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_one_of_multi_loop() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -440,7 +462,8 @@ module { // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]], in : [%[[DMA_3]]], out : []) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @hoist_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index @@ -491,6 +514,8 @@ module { // CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} // CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_1_3]]} @@ -499,8 +524,8 @@ module { // CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_6:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_3]]} // CHECK-DAG: %[[FROM_MEMREF_7:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_3]]} -// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_1_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: 
%[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] @@ -529,7 +554,8 @@ module { // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) // CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @nested_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 %c1 = arith.constant 1 : index @@ -589,23 +615,26 @@ module { // CHECK: linalg.fill ins(%[[C0]] : i32) outs(%[[ACCESS]] : memref<1x1x8x8x4x4xi32, 2 : i32>) // CHECK: amdaie.end // CHECK: memref.dealloc %[[ALLOC]] : -func.func @l1_temporary_buffer_for_matmul_elem() { - %c0_i32 = arith.constant 0 : i32 - %c2 = arith.constant 2 : index - %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> - scf.forall (%arg0, %arg1) in (1, 1) { - scf.forall (%arg2, %arg3) in (1, 1) { - %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> - %26 = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %26) - %27 = amdaie.core(%tile, in : [], out : []) { - linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @l1_temporary_buffer_for_matmul_elem() { + %c0_i32 = arith.constant 0 : i32 + %c2 = arith.constant 2 : index + %alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + scf.forall (%arg0, %arg1) in (1, 1) { + scf.forall (%arg2, %arg3) in (1, 1) { + %subview = memref.subview %alloc_6[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, 2 : i32> + %26 = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %26) + %27 = amdaie.core(%tile, in : [], out : []) { + linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, 2 : i32>) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } } // ----- @@ -618,19 +647,22 @@ func.func @l1_temporary_buffer_for_matmul_elem() { // CHECK-SAME: to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> // CHECK-NOT: memref.subview // CHECK: return -func.func @not_distributable() { - %cst = arith.constant 0.000000e+00 : bf16 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<2x2x100xbf16, 2> - scf.forall (%arg0, %arg1) in (2, 2) { - scf.for %arg2 = %c0 to %c4 step %c1 { - %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> - linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) 
- } - } {mapping = [#gpu.thread, #gpu.thread]} - return +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @not_distributable() { + %cst = arith.constant 0.000000e+00 : bf16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %alloc = memref.alloc() : memref<2x2x100xbf16, 2> + scf.forall (%arg0, %arg1) in (2, 2) { + scf.for %arg2 = %c0 to %c4 step %c1 { + %subview = memref.subview %alloc[%arg0, %arg1, %arg2] [1, 1, 10] [1, 1, 1] : memref<2x2x100xbf16, 2> to memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2> + linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x10xbf16, strided<[200, 100, 1], offset: ?>, 2>) + } + } {mapping = [#gpu.thread, #gpu.thread]} + return + } } @@ -652,6 +684,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%c0, %c1) // CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%c1, %c1) // CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%c0, %c0) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%c1, %c0) // CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} // CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_1]]} // CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_1]]} @@ -663,7 +696,7 @@ func.func @not_distributable() { // CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} // CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_1_0]]} // CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] // CHECK-SAME: %[[FROM_MEMREF_11]] @@ -702,7 +735,8 @@ func.func @not_distributable() { #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> #map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos() { %c2 = arith.constant 2 : index %c1024 = arith.constant 1024 : index @@ -818,7 +852,8 @@ module { // CHECK-DAG: vector.transfer_write %[[CONTRACT]], %[[VAL_2]] // CHECK-DAG-SAME: [%[[C0]], %[[C0]], %[[ARG3]], %[[ARG2]], %[[C0]], %[[C0]]] // CHECK-DAG-SAME: in_bounds = [true, true, true, true, true, true] -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @distribute_cores_and_objectfifos_vectorization() { %c192 = arith.constant 192 : index %c32 = arith.constant 32 : index @@ -918,7 +953,8 @@ module { // CHECK-DAG: func.call 
@matmul_i32_i32 // CHECK-DAG: amdaie.end // CHECK-DAG: } {elf_file = "/path/to/ukernel.o"} -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func private @matmul_i32_i32(memref, index, memref, index, memref, index) attributes {link_with = "/path/to/ukernels.o", llvm.bareptr = true} func.func @distribute_cores_and_objectfifos_ukernel() { %c64 = arith.constant 64 : index @@ -987,7 +1023,8 @@ module { // CHECK-SAME: ins(%[[ACCESS_1]] : memref<4x4xi32, 2 : i32>) outs(%[[SUBVIEW:.*]] : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) { #map = affine_map<(d0, d1) -> (d0, d1)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @mixed_alloc_subview_operands() { %c2 = arith.constant 2 : index %c0_i32 = arith.constant 0 : i32 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir index 1d3c38d7a..efbd2b931 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_l1_allocations.mlir @@ -90,4 +90,25 @@ func.func @non_distributing_subview(%index : index) { } +// ----- + +// Example where the subview type is a complete view of the alloc: unchanged IR. +// CHECK-LABEL: @complete_view_subview +// CHECK-NEXT: memref.alloc() : memref<4xbf16, 2> +// CHECK-NEXT: scf.forall +// CHECK-NEXT: arith.constant +// CHECK-NEXT: memref.subview +// CHECK-NEXT: linalg.fill +// CHECK-NEXT: mapping = [#gpu.thread] +// CHECK-NEXT: return + +func.func @complete_view_subview() { + %alloc = memref.alloc() : memref<4xbf16, 2> + scf.forall (%arg0) in (4) { + %c0_bf16 = arith.constant 0.000000e+00 : bf16 + %subview = memref.subview %alloc[0] [4] [1] : memref<4xbf16, 2> to memref<4xbf16, 2> + linalg.fill ins(%c0_bf16 : bf16) outs(%subview : memref<4xbf16, 2>) + } {mapping = [#gpu.thread]} + return +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir index 7f0ae6d13..f266e861d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_to_air.mlir @@ -64,9 +64,8 @@ func.func @func3() { // CHECK-LABEL: @func4 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1> // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<8x16xi32> -// CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, 1> -// CHECK: %[[TRANSPOSE0:.*]] = memref.transpose %[[SUBVIEW0]] (d0, d1) -> (d0, d1) : memref<8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> -// CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], %[[TRANSPOSE0]][] [] []) : (memref<8x16xi32>, memref<8x16xi32, strided<[16, 1]>, 1>) +// CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> +// CHECK: air.dma_memcpy_nd (%[[ALLOC1]][] [] [], 
%[[SUBVIEW0]][] [] []) : (memref<8x16xi32>, memref<8x16xi32, strided<[16, 1]>, 1>) func.func @func4() { %alloc = memref.alloc() : memref<1x1x8x16xi32, 1> %alloc_0 = memref.alloc() : memref<8x16xi32> @@ -79,9 +78,8 @@ func.func @func4() { // CHECK: scf.parallel (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) = // CHECK: %[[SUBVIEW0:.*]] = memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG1]], %[[ARG2]]] [1, 8, 64] [1, 1, 1] : memref<32x8x64xf32> to memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>> // CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x1x1x8x64xf32, 1> -// CHECK: %[[SUBVIEW1:.*]] = memref.subview %[[ALLOC1]][0, 0, 0, 0, 0] [1, 1, 1, 8, 64] [1, 1, 1, 1, 1] : memref<1x1x1x8x64xf32, 1> to memref<1x8x64xf32, 1> -// CHECK: %[[TRANSPOSE0:.*]] = memref.transpose %[[SUBVIEW1]] (d0, d1, d2) -> (d0, d1, d2) : memref<1x8x64xf32, 1> to memref<1x8x64xf32, strided<[512, 64, 1]>, 1> -// CHECK: air.dma_memcpy_nd (%[[SUBVIEW0]][] [] [], %[[TRANSPOSE0]][] [] []) : (memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>>, memref<1x8x64xf32, strided<[512, 64, 1]>, 1>) +// CHECK: %[[SUBVIEW1:.*]] = memref.subview %[[ALLOC1]][0, 0, 0, 0, 0] [1, 1, 1, 8, 64] [1, 1, 1, 1, 1] : memref<1x1x1x8x64xf32, 1> to memref<1x8x64xf32, strided<[512, 64, 1]>, 1> +// CHECK: air.dma_memcpy_nd (%[[SUBVIEW0]][] [] [], %[[SUBVIEW1]][] [] []) : (memref<1x8x64xf32, strided<[512, 64, 1], offset: ?>>, memref<1x8x64xf32, strided<[512, 64, 1]>, 1>) func.func @func5() { %c0 = arith.constant 0 : index @@ -164,9 +162,8 @@ func.func @func6() { memref.dealloc %alloc_9 : memref<1x1x2x2x4x8xi32, 2> scf.reduce } - // CHECK: %[[SUBVIEW8:.*]] = memref.subview %{{.*}}[0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, 1> - // CHECK: %[[TRANSPOSE5:.*]] = memref.transpose %[[SUBVIEW8]] (d0, d1) -> (d0, d1) : memref<8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> - // CHECK: air.dma_memcpy_nd (%[[SUBVIEW2]][] [] [], %[[TRANSPOSE5]][] [] []) : (memref<8x16xi32, strided<[32, 1], offset: ?>>, memref<8x16xi32, strided<[16, 1]>, 1>) + // CHECK: %[[SUBVIEW8:.*]] = memref.subview %{{.*}}[0, 0, 0, 0] [1, 1, 8, 16] [1, 1, 1, 1] : memref<1x1x8x16xi32, 1> to memref<8x16xi32, strided<[16, 1]>, 1> + // CHECK: air.dma_memcpy_nd (%[[SUBVIEW2]][] [] [], %[[SUBVIEW8]][] [] []) : (memref<8x16xi32, strided<[32, 1], offset: ?>>, memref<8x16xi32, strided<[16, 1]>, 1>) iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %subview_1 : (memref<1x1x8x16xi32, 1> memref<8x16xi32, strided<[32, 1], offset: ?>>) memref.dealloc %alloc_2 : memref<1x1x16x16xi32, 1> memref.dealloc %alloc : memref<1x1x8x16xi32, 1> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir new file mode 100644 index 000000000..cb087421e --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -0,0 +1,159 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// Test of splitting matmul lhs input objectFifo and dma operations. 
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-LABEL: func.func @split_L2_input_lhs
+// CHECK-DAG: %[[ALLOC_A0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_A1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #[[MAP1]](%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #[[MAP0]](%[[IV0]])
+// CHECK: %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[%[[IV0_0:.*]], 0] [32, 32] [128, 1]
+// CHECK: %[[DMA_L3_TO_L2_A1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_A1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[%[[IV0_32:.*]], 0] [32, 32] [128, 1]
+// CHECK: %[[DMA_L2_TO_L1_A0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
+// CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
+// CHECK: %[[DMA_L2_TO_L1_A1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
+// CHECK-SAME: %[[OBJ_L2_A1]][0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
+// CHECK: memref.dealloc %[[ALLOC_A0]] : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_A1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg1)
+      %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    } {mapping = [#gpu.block, #gpu.block]}
+    memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Test of splitting matmul rhs input objectFifo and dma operations.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-LABEL: func.func @split_L2_input_rhs
+// CHECK-DAG: %[[ALLOC_B0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_B1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_B0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_B0]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_B1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_B1]], {} :
+// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #[[MAP1]](%[[IV1]])
+// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #[[MAP0]](%[[IV1]])
+// CHECK: %[[DMA_L3_TO_L2_B0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_B0]][0, 0, 0, 0] [1, 32, 1, 32] [2048, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[0, %[[IV1_0:.*]]] [32, 32] [128, 1]
+// CHECK: %[[DMA_L3_TO_L2_B1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_B1]][0, 0, 0, 0] [1, 32, 1, 32] [2048, 32, 1024, 1]
+// CHECK-SAME: {{.*}}[0, %[[IV1_32:.*]]] [32, 32] [128, 1]
+// CHECK: %[[DMA_L2_TO_L1_B0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1]
+// CHECK-SAME: %[[OBJ_L2_B0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK: %[[DMA_L2_TO_L1_B1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1]
+// CHECK-SAME: %[[OBJ_L2_B1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK: memref.dealloc %[[ALLOC_B0]] : memref<1x1x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_B1]] : memref<1x1x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg2)
+      %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+    } {mapping = [#gpu.block, #gpu.block]}
+    memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Test of splitting matmul output objectFifo and dma operations.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 64)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 * 64 + 32)>
+// CHECK-LABEL: func.func @split_L2_output
+// CHECK-DAG: %[[ALLOC_C0:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK-DAG: %[[ALLOC_C1:.*]] = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: %[[OBJ_L2_C0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_C0]], {} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: %[[OBJ_L2_C1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_C1]], {} :
+// CHECK-SAME: memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2)
+// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #[[MAP0]](%[[IV1]])
+// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #[[MAP0]](%[[IV0]])
+// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #[[MAP1]](%[[IV0]])
+// CHECK: %[[DMA_L1_TO_L2_C0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C3:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L1_TO_L2_C4:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]
+// CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]
+// CHECK: %[[DMA_L2_TO_L3_C0:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[%[[IV0_0:.*]], %[[IV1_0:.*]]] [32, 64] [128, 1]
+// CHECK-SAME: %[[OBJ_L2_C0]][0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1]
+// CHECK: %[[DMA_L2_TO_L3_C1:.*]] = amdaie.dma_cpy_nd(
+// CHECK-SAME: {{.*}}[%[[IV0_32:.*]], %[[IV1_0:.*]]] [32, 64] [128, 1]
+// CHECK-SAME: %[[OBJ_L2_C1]][0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1]
+// CHECK: memref.dealloc %[[ALLOC_C0]] : memref<1x2x32x32xi32, 1 : i32>
+// CHECK: memref.dealloc %[[ALLOC_C1]] : memref<1x2x32x32xi32, 1 : i32>
+#map = affine_map<(d0) -> (d0 * 64)>
+module {
+  func.func @split_L2_output(%arg0: memref<128x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo>
+    %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo>
+    scf.forall (%arg1, %arg2) in (2, 2) {
+      %2 = affine.apply #map(%arg2)
+      %3 = affine.apply #map(%arg1)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo>
+      %5 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>)
+      %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1,
8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } +} diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc index c3dc89afe..3a654f27a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.cc @@ -486,7 +486,8 @@ std::optional> Router::findPaths( auto curr = endPoint; // trace backwards until a vertex already processed is reached while (!processed.count(curr)) { - auto &sb = impl->graph[std::make_pair(preds[curr].tileLoc, curr.tileLoc)]; + auto &sb = + impl->graph[std::make_pair(preds[curr].tileLoc, curr.tileLoc)]; size_t i = std::distance(sb.srcPorts.begin(), std::find(sb.srcPorts.begin(), sb.srcPorts.end(), @@ -870,7 +871,8 @@ FailureOr> emitPacketRoutingConfiguration( MasterSetsT mastersets; for (const auto &[physPort, ports] : masterAMSels) { for (Port port : ports) { - mastersets[PhysPort{physPort.first, port}].push_back(physPort.second); + mastersets[PhysPort{physPort.first, port, PhysPort::Direction::DST}] + .push_back(physPort.second); } } @@ -883,6 +885,14 @@ FailureOr> emitPacketRoutingConfiguration( /// ================== stringification utils ============================= /// ====================================================================== +std::string to_string(const PhysPort::Direction &direction) { + switch (direction) { + STRINGIFY_ENUM_CASE(PhysPort::Direction::SRC) + STRINGIFY_ENUM_CASE(PhysPort::Direction::DST) + } + llvm::report_fatal_error("Unhandled PhysPortDirection case"); +} + std::string to_string(const SwitchSetting &setting) { return "SwitchSetting(" + llvm::join( @@ -913,7 +923,7 @@ std::string to_string(const SwitchSettings &settings) { STRINGIFY_2TUPLE_STRUCT(Port, bundle, channel) STRINGIFY_2TUPLE_STRUCT(Connect, src, dst) STRINGIFY_2TUPLE_STRUCT(PathEndPoint, tileLoc, port) -STRINGIFY_2TUPLE_STRUCT(PhysPort, tileLoc, port) +STRINGIFY_3TUPLE_STRUCT(PhysPort, tileLoc, port, direction) STRINGIFY_2TUPLE_STRUCT(PhysPortAndID, physPort, id) BOTH_OSTREAM_OPS_FORALL_ROUTER_TYPES(OSTREAM_OP_DEFN, BOTH_OSTREAM_OP) diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h index b30a1a147..449eb6ca2 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_router.h @@ -23,7 +23,8 @@ struct Port { // mlir-air legacy Port() : bundle(), channel() {} - Port(StrmSwPortType b, int c) : bundle(b), channel(c) {} + Port(StrmSwPortType b, int c) + : bundle(b), channel(c) {} typedef std::tuple TupleType; Port(TupleType t) : Port(std::get<0>(t), std::get<1>(t)) {} operator TupleType() const { return {bundle, channel}; } @@ -109,12 +110,16 @@ bool existsPathToDest(const SwitchSettings &settings, TileLoc currTile, int finalDestChannel); struct PhysPort { + enum Direction { SRC, DST }; TileLoc tileLoc; Port port; - 
PhysPort(TileLoc t, Port p) : tileLoc(t), port(p) {} - using TupleType = std::tuple; - PhysPort(TupleType t) : PhysPort(std::get<0>(t), std::get<1>(t)) {} - operator TupleType() const { return {tileLoc, port}; } + Direction direction; + PhysPort(TileLoc t, Port p, Direction direction) + : tileLoc(t), port(p), direction(direction) {} + using TupleType = std::tuple; + PhysPort(TupleType t) + : PhysPort(std::get<0>(t), std::get<1>(t), std::get<2>(t)) {} + operator TupleType() const { return {tileLoc, port, direction}; } TUPLE_LIKE_STRUCT_RELATIONAL_OPS(PhysPort) }; @@ -166,7 +171,9 @@ TO_STRINGS(TO_STRING_DECL) _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::Port) \ _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::SwitchSetting) \ _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPort) \ - _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPortAndID) + _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPortAndID) \ + _(OSTREAM_OP_, mlir::iree_compiler::AMDAIE::PhysPort::Direction) + BOTH_OSTREAM_OPS_FORALL_ROUTER_TYPES(OSTREAM_OP_DECL, BOTH_OSTREAM_OP) diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index 0af56fd20..a29fe0898 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -318,6 +318,22 @@ uint32_t AMDAIEDeviceModel::getMemTileSize(uint8_t col, uint8_t row) const { return devInst.DevProp.DevMod[static_cast(tileType)].MemMod->Size; } +SmallVector AMDAIEDeviceModel::getMemSpaceRows( + uint8_t memSpace) const { + SmallVector res; + if (memSpace == 0) { + res.resize(deviceConfig.shimTileNumRows); + std::iota(res.begin(), res.end(), configPtr.ShimRowNum); + } else if (memSpace == 1) { + res.resize(configPtr.MemTileNumRows); + std::iota(res.begin(), res.end(), configPtr.MemTileRowStart); + } else if (memSpace == 2) { + res.resize(configPtr.AieTileNumRows); + std::iota(res.begin(), res.end(), configPtr.AieTileRowStart); + } + return res; +} + bool AMDAIEDeviceModel::hasLegalMemAffinity(uint8_t coreCol, uint8_t coreRow, uint8_t memCol, uint8_t memRow) const { @@ -483,6 +499,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { switch (device) { case AMDAIEDevice::xcvc1902: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE1_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE1_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE1_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE1_SS_MSEL_MAX; @@ -507,6 +524,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2302: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -530,6 +548,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::xcve2802: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIEML_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIEML_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIEML_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIEML_SS_MSEL_MAX; @@ -557,6 +576,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + 
deviceConfig.shimTileNumRows = XAIE2IPU_SHIM_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE2IPU_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE2IPU_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE2IPU_SS_MSEL_MAX; @@ -603,6 +623,7 @@ struct AMDAIEDeviceModel getDeviceModel(AMDAIEDevice device) { } case AMDAIEDevice::npu4: { AMDAIEDeviceModel::AMDAIEDeviceConfig deviceConfig; + deviceConfig.shimTileNumRows = XAIE_STRIXB0_MEM_TILE_NUM_ROWS; deviceConfig.packetIdMaxIdx = XAIE_STRIXB0_PACKET_ID_MAX; deviceConfig.streamSwitchCoreArbiterMax = XAIE_STRIXB0_SS_ARBITER_MAX; deviceConfig.streamSwitchCoreMSelMax = XAIE_STRIXB0_SS_MSEL_MAX; diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index a45f798ad..144e1dc62 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h @@ -201,7 +201,7 @@ inline ::XAie_TxnOpcode txnToTxn(XAie_TxnOpcode t) { } // mlir-air legacy -enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 }; +enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2, AIE2p = 3 }; /* * This struct is meant to be a thin wrapper around aie-rt, which provides @@ -228,6 +228,12 @@ struct AMDAIEDeviceModel { /// retrieved in another way before adding new fields to this struct. struct AMDAIEDeviceConfig { + /////////////////////////////////////// + // AIE Array configuration constants // + /////////////////////////////////////// + /// The number of shim tile rows. Not found in aie-rt data structures, but + /// provided as `XAIE_SHIM_NUM_ROWS`. + uint8_t shimTileNumRows{1}; /// Set default minimum stride bitwidth/addressing granularity to 32 bits as /// this is the value for all current architecture versions. 
uint8_t minStrideBitWidth{32}; @@ -334,6 +340,8 @@ struct AMDAIEDeviceModel { uint32_t getMemTileSize(uint8_t col, uint8_t row) const; uint32_t getCoreTileLocalMemorySize() const; + SmallVector getMemSpaceRows(uint8_t memSpace) const; + uint32_t getNumBDs(uint8_t col, uint8_t row) const; uint32_t getNumSourceSwitchBoxConnections(uint8_t col, uint8_t row, @@ -356,7 +364,9 @@ struct AMDAIEDeviceModel { return deviceConfig.vectorLoadStoreAlignmentBits; } - uint32_t getMaxVectorSizeBits() const { return deviceConfig.maxVectorSizeBits; } + uint32_t getMaxVectorSizeBits() const { + return deviceConfig.maxVectorSizeBits; + } uint32_t getShiftOperandBits() const { return deviceConfig.shiftOperandBits; } diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc index 655d33bc3..635c4eaa8 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/test/test_amsel_generator.cc @@ -16,8 +16,9 @@ std::pair amsel(uint8_t a, uint8_t msel) { TEST(AMSelGeneratorTest, TileNotInitialized) { AMSelGenerator generator; TileLoc tileLoc(0, 1); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = { + {{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(failed(generator.addConnection(tileLoc, src1, {dst1}))); } @@ -25,8 +26,8 @@ TEST(AMSelGeneratorTest, NoArbitersNoMSels) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 0, 0); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -35,8 +36,8 @@ TEST(AMSelGeneratorTest, NoArbiters) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 0, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -45,8 +46,8 @@ TEST(AMSelGeneratorTest, NoMSels) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 0); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(failed(generator.solve())); } @@ -55,19 +56,19 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; 
- PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); for (int i = 1; i < 6; i++) { - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, i}}, i}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, i}}, i}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, i}, PhysPort::Direction::DST}, i}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); } EXPECT_TRUE(succeeded(generator.solve())); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, i}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, i}, PhysPort::Direction::SRC}, i}; EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(i, 0)); } } @@ -79,13 +80,13 @@ TEST(AMSelGeneratorTest, SingleSrcSingleDstSamePorts) { TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, i}; - PhysPortAndID dst = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i}; + PhysPortAndID dst = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, i}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src, {dst}))); } EXPECT_TRUE(succeeded(generator.solve())); for (int i = 0; i < 6; i++) { - PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, i}; + PhysPortAndID src = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, i}; EXPECT_EQ(generator.getAMSel(tileLoc, src).value(), amsel(0, 0)); } } @@ -94,15 +95,15 @@ TEST(AMSelGeneratorTest, SingleSrcMultiDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::EAST, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::EAST, 1}}, 1}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::DST}, 1}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(1, 0)); @@ -113,19 +114,19 @@ TEST(AMSelGeneratorTest, 
MultiSrcSingleDst) { TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); // Reuse msels for multiple sources. - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 0}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst1}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 0)); - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 1}; - PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::EAST, 0}}, 1}; - PhysPortAndID src5 = {{{0, 1}, {StrmSwPortType::EAST, 1}}, 1}; - PhysPortAndID src6 = {{{0, 1}, {StrmSwPortType::EAST, 2}}, 1}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::EAST, 0}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src5 = {{{0, 1}, {StrmSwPortType::EAST, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src6 = {{{0, 1}, {StrmSwPortType::EAST, 2}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src5, {dst2}))); @@ -143,23 +144,23 @@ TEST(AMSelGeneratorTest, MultiSrcMultiDst) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 6, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 1}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.solve())); EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(0, 1)); - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 2}; - PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 3}; - PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::WEST, 
0}}, 2}; - PhysPortAndID dst6 = {{{0, 1}, {StrmSwPortType::WEST, 1}}, 2}; - PhysPortAndID dst7 = {{{0, 1}, {StrmSwPortType::WEST, 0}}, 3}; - PhysPortAndID dst8 = {{{0, 1}, {StrmSwPortType::WEST, 2}}, 3}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID src4 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 3}; + PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 2}; + PhysPortAndID dst6 = {{{0, 1}, {StrmSwPortType::WEST, 1}, PhysPort::Direction::DST}, 2}; + PhysPortAndID dst7 = {{{0, 1}, {StrmSwPortType::WEST, 0}, PhysPort::Direction::DST}, 3}; + PhysPortAndID dst8 = {{{0, 1}, {StrmSwPortType::WEST, 2}, PhysPort::Direction::DST}, 3}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5, dst6}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src4, {dst7, dst8}))); EXPECT_TRUE(succeeded(generator.solve())); @@ -173,14 +174,14 @@ TEST(AMSelGeneratorTest, ReuseArbiters) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 1, 4); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}}, 2}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 0}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 1}; - PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 3}}, 1}; - PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::NORTH, 4}}, 2}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst4 = {{{0, 1}, {StrmSwPortType::NORTH, 3}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst5 = {{{0, 1}, {StrmSwPortType::NORTH, 4}, PhysPort::Direction::DST}, 2}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1, dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst3, dst4}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst5}))); @@ -194,18 +195,33 @@ TEST(AMSelGeneratorTest, ReuseArbitersFailure) { AMSelGenerator generator; TileLoc tileLoc(0, 1); generator.initTileIfNotExists(tileLoc, 1, 2); - PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}}, 0}; - PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}}, 1}; - PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}}, 2}; - PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}}, 0}; - PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::NORTH, 1}}, 1}; - PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}}, 2}; + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::SOUTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::SOUTH, 1}, PhysPort::Direction::SRC}, 1}; + PhysPortAndID src3 = {{{0, 1}, {StrmSwPortType::SOUTH, 2}, PhysPort::Direction::SRC}, 2}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID dst2 = 
{{{0, 1}, {StrmSwPortType::NORTH, 1}, PhysPort::Direction::DST}, 1}; + PhysPortAndID dst3 = {{{0, 1}, {StrmSwPortType::NORTH, 2}, PhysPort::Direction::DST}, 2}; EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src3, {dst3}))); EXPECT_TRUE(failed(generator.solve())); } +TEST(AMSelGeneratorTest, DifferentDirections) { + AMSelGenerator generator; + TileLoc tileLoc(0, 1); + generator.initTileIfNotExists(tileLoc, 6, 4); + PhysPortAndID src1 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst1 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::DST}, 0}; + PhysPortAndID src2 = {{{0, 1}, {StrmSwPortType::NORTH, 0}, PhysPort::Direction::SRC}, 0}; + PhysPortAndID dst2 = {{{0, 1}, {StrmSwPortType::DMA, 0}, PhysPort::Direction::DST}, 0}; + EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src1, {dst1}))); + EXPECT_TRUE(succeeded(generator.addConnection(tileLoc, src2, {dst2}))); + EXPECT_TRUE(succeeded(generator.solve())); + EXPECT_EQ(generator.getAMSel(tileLoc, src1).value(), amsel(0, 0)); + EXPECT_EQ(generator.getAMSel(tileLoc, src2).value(), amsel(1, 0)); +} + } // namespace mlir::iree_compiler::AMDAIE int main(int argc, char **argv) { diff --git a/third_party/mlir-air b/third_party/mlir-air index 3bab1025d..f3884b6d0 160000 --- a/third_party/mlir-air +++ b/third_party/mlir-air @@ -1 +1 @@ -Subproject commit 3bab1025d02ffd2b14c0e887bb3749b4836936b2 +Subproject commit f3884b6d0e1910424d47f2310bba4666dd6c8105