From 03830929697464666b58be717ece8328bc6c6965 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:28:15 +0000 Subject: [PATCH 01/10] [Fix] InferDuplicateStreamsLayer now properly handles forks of multiple-output nodes --- .../fpgadataflow/convert_to_hw_layers.py | 96 +++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 25a2032aeb..b02bc89db8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -585,63 +585,63 @@ def apply(self, model): for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + for output_tensor in node.output: + successors = model.find_consumers(output_tensor) + if successors is not None and len(successors) >= 2: + n_outputs = len(successors) - dt = model.get_tensor_datatype(output_tensor) + dt = model.get_tensor_datatype(output_tensor) - # skip conversion for layers with float input - if not dt.is_integer(): - continue + # skip conversion for layers with float input + if not dt.is_integer(): + continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # create node with no parallelization first - pe = 1 + # create node with no parallelization first + pe = 1 - dup_node = helper.make_node( - "DuplicateStreams", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_" + node.name, - ) + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) - graph.node.insert(node_ind, dup_node) + graph.node.insert(node_ind, dup_node) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if 
succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - graph_modified = True + graph_modified = True if graph_modified: model = model.transform(SortGraph()) From d13aa7e7debb21bd1d75b6dbb6eddc959b4ae8c8 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:48:43 +0000 Subject: [PATCH 02/10] [Fix] MoveScalarLinearPastInvariants, MakeMaxPoolNHWC, MakeScaleResizeNHWC transformations are checking whether the node to be moved is a fork node, in which case the MoveOpPastFork is called. MoveOpPastFork uses deepcopies of the original node. --- src/finn/transformation/streamline/reorder.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 8ac2d7dad6..9a7e9d0723 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,6 +29,7 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings +from copy import deepcopy from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -641,6 +642,10 @@ def apply(self, model): # if initializer is not scalar, skip if np.prod(init0.shape) != 1: continue + if model.is_fork_node(prod0): + model = model.transform(MoveOpPastFork(prod0.op_type)) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) # Flatten input if required if len(init0.shape) > 0: init0 = init0.flatten()[0] @@ -713,6 +718,12 @@ def apply(self, model): elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) ceil_mode = get_by_name(n.attribute, "ceil_mode") if ceil_mode is not None: ceil_mode = ceil_mode.i @@ -764,6 +775,12 @@ def apply(self, model): if producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) old_value = model.get_initializer(n.input[scales_ind]) new_value = np.array( [old_value[idx] for idx in (0, 2, 3, 1)], @@ -813,10 +830,9 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): + def __init__(self, op_name_list): super().__init__() self.ops_to_move = op_name_list - self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -859,11 +875,9 @@ def apply(self, model): new_param_name = model.make_new_valueinfo_name() new_inp_list = [n.input[0], new_param_name] model.set_initializer(new_param_name, op_init_param) - attrs = self.get_attrs_fxn(n) - # TODO use copy of original node instead to get attrs? 
- new_node = oh.make_node( - n.op_type, new_inp_list, [new_output_tensor_name], **attrs - ) + new_node = deepcopy(n) + new_node.input[:] = new_inp_list + new_node.output[:] = [new_output_tensor_name] graph.node.insert(node_ind, new_node) node_ind += 1 @@ -901,7 +915,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + super().__init__(["Transpose"]) class MoveMaxPoolPastMultiThreshold(Transformation): From 6223abe86c7d9aee43788825f3c19545dab0ea54 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 13:59:14 +0000 Subject: [PATCH 03/10] [Fix] InsertFIFO transform is fixed for the case of the last node in the graph being a fork node --- src/finn/transformation/fpgadataflow/insert_fifo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 21fb843052..9ed0f51cd4 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -268,7 +268,7 @@ def apply(self, model): fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), n0_tensor_dtype, - n0.get_normal_output_shape(), + n0.get_normal_output_shape(out_ind), ) graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) @@ -294,7 +294,7 @@ def apply(self, model): graph.node.append(fifo_node) # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + final_node.output[out_ind] = fifo_input_tensor.name else: warnings.warn( """Output FIFO for %s has depth %d and won't From d7c9391e36102588e1b5cc9b46d132633c9e4267 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 16:44:54 +0000 Subject: [PATCH 04/10] [Feature] Timeout template added --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 14 ++++++ src/finn/custom_op/fpgadataflow/templates.py | 45 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..c03a9029db 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -474,3 +474,17 @@ def get_ap_int_max_w(self): ret = max([instream, outstream]) assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret + + def timeout_value(self): + """Set timeout value for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"] + + def timeout_condition(self): + """Set timeout condition for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())] + + def timeout_read_stream(self): + """Set reading output stream procedure for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ + "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname()) + ] diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..7ef74118ec 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -58,6 +58,51 @@ """ +# template for single node execution with timeout (for single clock hls operations) +docompute_template_timeout = 
""" +#define AP_INT_MAX_W $AP_INT_MAX_W$ +#include "cnpy.h" +#include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" +#include +#include "bnn-library.h" + +// includes for network parameters +$GLOBALS$ + +// defines for network parameters +$DEFINES$ + +int main(){ +$PRAGMAS$ + +$STREAMDECLARATIONS$ + +$READNPYDATA$ + +unsigned timeout = 0; +while(timeout < $TIMEOUT_VALUE$){ + +$DOCOMPUTE$ + +if($TIMEOUT_CONDITION$){ +timeout++; +} + +else{ +$TIMEOUT_READ_STREAM$ +timeout = 0; +} +} + +$DATAOUTSTREAM$ + +$SAVEASCNPY$ + +} + +""" + # templates for single node ip generation # cpp file From 6da0ce4d10db86f2eea3bb513164c752401956d8 Mon Sep 17 00:00:00 2001 From: mdaniowi Date: Fri, 20 Sep 2024 16:02:40 +0100 Subject: [PATCH 05/10] [Feature] npy2vectorstream.hpp include added to docompute_template --- src/finn/custom_op/fpgadataflow/templates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 7ef74118ec..d2100a7516 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -32,6 +32,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" From 8f87454c45c688496d6e4e1650229e81e8417867 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 17:05:34 +0000 Subject: [PATCH 06/10] [Feature] New Split custom_op added --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../custom_op/fpgadataflow/hls/split_hls.py | 278 ++++++++++++++++++ src/finn/custom_op/fpgadataflow/split.py | 164 +++++++++++ 4 files changed, 446 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/split_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/split.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index aed2ab7fe1..6f48bc6308 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -42,6 +42,7 @@ from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.pool import Pool +from finn.custom_op.fpgadataflow.split import StreamingSplit from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) @@ -77,6 +78,7 @@ custom_op["Lookup"] = Lookup custom_op["Pool"] = Pool custom_op["StreamingConcat"] = StreamingConcat +custom_op["StreamingSplit"] = StreamingSplit custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..e5b24413eb 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -43,6 +43,7 @@ from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls +from finn.custom_op.fpgadataflow.hls.split_hls import StreamingSplit_hls from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( StreamingDataWidthConverter_hls, ) @@ -71,6 +72,7 @@ custom_op["Lookup_hls"] = Lookup_hls 
custom_op["Pool_hls"] = Pool_hls custom_op["StreamingConcat_hls"] = StreamingConcat_hls +custom_op["StreamingSplit_hls"] = StreamingSplit_hls custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/split_hls.py b/src/finn/custom_op/fpgadataflow/hls/split_hls.py new file mode 100644 index 0000000000..d6f9d43f51 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/split_hls.py @@ -0,0 +1,278 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow import templates +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.split import StreamingSplit +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingSplit_hls(StreamingSplit, HLSBackend): + """Streaming split node with dynamically generated HLS. + Only supports splitting along the last axis.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingSplit.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + ishape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + n_outputs = self.get_n_outputs() + exp_oshapes = [self.get_normal_output_shape(i) for i in range(len(node.output))] + export_idt = self.get_input_datatype() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == ishape, "Input shape mismatch for " + node.input[0] + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_outputs(context, ["output_%d.npy" % i for i in range(n_outputs)]) + for i in range(n_outputs): + assert ( + context[node.output[i]].shape == exp_oshapes[i] + ), "cppsim did not produce expected folded output shape: {}, expected: {}".format( + context[node.output[i]].shape, exp_oshapes[i] + ) + elif mode == "rtlsim": + sim = self.get_rtlsim() + io_dict = {"inputs": {}, "outputs": {}} + + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "%s/input_0.npy" % code_gen_dir, + export_idt, + nbits, + # reverse_inner=True, + ) + io_dict["inputs"]["in0"] = rtlsim_inp + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + for i in range(n_outputs): + io_dict["outputs"]["out_arr_%d" % i] = [] + self.rtlsim_multi_io(sim, io_dict, sname="_") + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + for i in range(n_outputs): + out_npy_path = "%s/output_%d.npy" % (code_gen_dir, i) + out_shape = self.get_folded_output_shape(i) + rtlsim_output_to_npy( + io_dict["outputs"]["out_arr_%d" % i], + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + # reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshapes[i]) + context[node.output[i]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + for i in range(n_outputs): + assert ( + context[node.output[i]].shape == exp_oshapes[i] + ), "cppsim did not produce expected folded output shape. 
Got: {}, expected: {}".format(
+                context[node.output[i]].shape, exp_oshapes[i]
+            )
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+        self.timeout_value()
+        self.timeout_condition()
+        self.timeout_read_stream()
+
+        template = templates.docompute_template_timeout
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "split.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = ["#define NUM_OUTPUTS " + str(self.get_n_outputs())]
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        simd = self.get_nodeattr("SIMD")
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2vectorstream<%s, %s, %d>("%s", in0);'
+            % (input_elem_hls_type, npy_type, simd, npy_in)
+        )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        simd = self.get_nodeattr("SIMD")
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        stream_name = "in0"
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<hls::vector<%s, %d>> %s ("%s");'
+            % (input_elem_hls_type, simd, stream_name, stream_name)
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            "hls::stream<hls::vector<{}, {}>> out_arr[NUM_OUTPUTS];".format(
+                self.get_output_datatype().get_hls_datatype_str(), simd
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            "hls::stream<hls::vector<{}, {}>> debug_out_arr[NUM_OUTPUTS];".format(
+                self.get_output_datatype().get_hls_datatype_str(), simd
+            )
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        n_outputs = self.get_n_outputs()
+        output_folds = [str(self.get_folded_output_shape(i)[-2]) for i in range(n_outputs)]
+        out_stream_folds = ", ".join(output_folds)
+        comp_call = "StreamingSplit<{}>(in0, out_arr);".format(out_stream_folds)
+        self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
+
+    def dataoutstrm(self):
+        npy_type = "float"
+        simd = self.get_nodeattr("SIMD")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        n_outputs = self.get_n_outputs()
+        self.code_gen_dict["$DATAOUTSTREAM$"] = []
+        for i in range(n_outputs):
+            oshape = self.get_folded_output_shape(i)
+            oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+            npy_out = "%s/output_%d.npy" % (code_gen_dir, i)
+            self.code_gen_dict["$DATAOUTSTREAM$"].append(
+                'vectorstream2npy<%s, %s, %d>(debug_out_arr[%d], %s, "%s");'
+                % (
+                    self.get_output_datatype(i).get_hls_datatype_str(),
+                    npy_type,
+                    simd,
+                    i,
+                    oshape_cpp_str,
+                    npy_out,
+                )
+            )
+
+    def blackboxfunction(self):
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        simd = self.get_nodeattr("SIMD")
+        in_stream = "hls::stream<hls::vector<%s, %d>> &in0" % (input_elem_hls_type, simd)
"hls::stream> &in0" % (input_elem_hls_type, simd) + out_streams = "hls::stream> (&out_arr)[NUM_OUTPUTS]" % ( + input_elem_hls_type, + simd, + ) + blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_streams) + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] + + def pragmas(self): + pragmas = [] + pragmas.append("#pragma HLS INTERFACE axis port=in0") + for i in range(self.get_n_outputs()): + pragmas.append("#pragma HLS INTERFACE axis port=out_arr[%d]" % i) + pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") + pragmas.append("#pragma HLS aggregate variable=in0 compact=bit") + for i in range(self.get_n_outputs()): + pragmas.append("#pragma HLS aggregate variable=out_arr[%d] compact=bit" % i) + self.code_gen_dict["$PRAGMAS$"] = pragmas + + def timeout_condition(self): + condition = [] + for i in range(self.get_n_outputs()): + condition.append("out_arr[{}].empty()".format(i)) + condition = " && ".join(condition) + self.code_gen_dict["$TIMEOUT_CONDITION$"] = [condition] + + def timeout_read_stream(self): + read_stream_command = """ +for(int i = 0; i < NUM_OUTPUTS; i++){ + if(!out_arr[i].empty()) + debug_out_arr[i] << out_arr[i].read(); +} +""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [read_stream_command] diff --git a/src/finn/custom_op/fpgadataflow/split.py b/src/finn/custom_op/fpgadataflow/split.py new file mode 100644 index 0000000000..e6ec551bc4 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/split.py @@ -0,0 +1,164 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from onnx import helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class StreamingSplit(HWCustomOp): + """Abstraction layer for HW implementation of Split. 
+ Only supports splitting along the last (channel) axis.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "SIMD": ("i", True, 0), + # number of elements of each output streams + "ChannelsPerStream": ("ints", True, []), + # FINN DataTypes for input; output datatypes inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors for non-split axes, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_n_outputs(self): + return len(self.get_nodeattr("ChannelsPerStream")) + + def get_total_elems(self): + elems_per_stream = self.get_nodeattr("ChannelsPerStream") + return int(np.sum(elems_per_stream)) + + def get_normal_input_shape(self, ind=0): + total_elems = self.get_total_elems() + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [total_elems]) + return ishape + + def get_folded_input_shape(self, ind=0): + simd = self.get_nodeattr("SIMD") + folds = self.get_total_elems() // simd + vecs = list(self.get_nodeattr("numInputVectors")) + return tuple(vecs + [folds, simd]) + + def get_normal_output_shape(self, ind=0): + elems = self.get_nodeattr("ChannelsPerStream")[ind] + vecs = list(self.get_nodeattr("numInputVectors")) + return tuple(vecs + [elems]) + + def get_folded_output_shape(self, ind=0): + elems = self.get_nodeattr("ChannelsPerStream")[ind] + simd = self.get_nodeattr("SIMD") + folds = elems // simd + vecs = list(self.get_nodeattr("numInputVectors")) + return tuple(vecs + [folds, simd]) + + def make_shape_compatible_op(self, model): + # check input shape + exp_ishape = self.get_normal_input_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape" + + assert len(self.onnx_node.output) == self.get_n_outputs(), "Unexpected number of outputs" + ret = helper.make_node("Split", self.onnx_node.input, self.onnx_node.output, axis=-1) + return ret + + def infer_node_datatype(self, model): + # check input datatype + inp = self.onnx_node.input[0] + idt = model.get_tensor_datatype(inp) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + self.onnx_node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + for out in self.onnx_node.output: + model.set_tensor_datatype(out, odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + # all output datatypes are the same as the input datatype + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + return ibits * self.get_nodeattr("SIMD") + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + out_width = obits * self.get_nodeattr("SIMD") + return out_width + + def get_number_output_values(self): + num_output_values = 0 + for i in range(self.get_n_outputs()): + num_output_values += np.prod(self.get_folded_output_shape(i)[:-1]) + return num_output_values + + def get_exp_cycles(self): + return 
np.prod(self.get_folded_input_shape()[:-1])
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        split = self.get_nodeattr("ChannelsPerStream")
+        np_split_param = np.cumsum(split[:-1])
+        np_result = np.split(context[node.input[0]], np_split_param, axis=-1)
+        for i, out in enumerate(node.output):
+            context[out] = np_result[i]
+
+    def get_instream_width_padded(self, ind=0):
+        in_width = self.get_instream_width()
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        intf_names["s_axis"] = [("in0", self.get_instream_width_padded())]
+        intf_names["m_axis"] = []
+        for i in range(self.get_n_outputs()):
+            intf_names["m_axis"].append(("out_arr_%d" % i, self.get_instream_width_padded()))
+        return intf_names

From 8ea47f37f288195564b908c7a374b1ce913ef450 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz
Date: Mon, 16 Sep 2024 17:08:54 +0000
Subject: [PATCH 07/10] [Feature] Change signal name option added to
 hwcustomop.rtlsim_multi_io, useful for array interfaces

---
 src/finn/custom_op/fpgadataflow/hwcustomop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index b40b8f3074..602a923424 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -284,11 +284,11 @@ def rtlsim(self, sim, inp, inp2=None):
         sim.stop_vcd_trace()
         return outputs

-    def rtlsim_multi_io(self, sim, io_dict):
+    def rtlsim_multi_io(self, sim, io_dict, sname=None):
         "Run rtlsim for this node, supports multiple i/o streams."

         # signal name
-        sname = "_" + self.hls_sname() + "_"
+        sname = "_" + self.hls_sname() + "_" if sname is None else sname

         trace_file = self.get_nodeattr("rtlsim_trace")
         if trace_file == "default":

From 59cfce74a4ba3788d0bf0596a6b0976ea5a030a0 Mon Sep 17 00:00:00 2001
From: Michal Danilowicz
Date: Mon, 16 Sep 2024 17:11:43 +0000
Subject: [PATCH 08/10] [Feature] InferSplitLayer() added

---
 .../fpgadataflow/convert_to_hw_layers.py      | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index b02bc89db8..e4f10af3eb 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1250,6 +1250,72 @@ def apply(self, model):
         return (model, graph_modified)


+class InferSplitLayer(Transformation):
+    """Convert suitable Split nodes (operating on last/-1 axis)
+    into StreamingSplit HW layers."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Split":
+                split_param = node.input[1]
+                if model.get_initializer(split_param) is None:
+                    warnings.warn("Split param not constant, skipping InferSplitLayer()")
+                    continue
+                ishape = model.get_tensor_shape(node.input[0])
+                axis = get_by_name(node.attribute, "axis")
+                if (axis is None) or (ishape is None):
+                    continue
+                axis = axis.i
+                last_axis = len(ishape) - 1
+                # skip conversion if not using last axis
+                if (axis != -1) and (axis != last_axis):
+                    warnings.warn(
+                        "StreamingSplit supports only last axis, skipping InferSplitLayer()"
+                    )
+                    continue
+                # only one input allowed (two including split_param)
+                if len(node.input) != 2:
+                    warnings.warn("Only one input allowed, skipping
InferSplitLayer()") + continue + # skip conversion if the input is static + if model.get_initializer(node.input[0]) is not None: + warnings.warn("Static input detected, skipping InferSplitLayer()") + continue + # skip conversion if inputs are not integers + if not model.get_tensor_datatype(node.input[0]).is_integer(): + warnings.warn("Non-integer input detected, skipping InferSplitLayer()") + continue + # ready for conversion + channels_per_stream = [model.get_tensor_shape(x)[-1] for x in node.output] + inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) + new_node = helper.make_node( + "StreamingSplit", + node.input, + node.output, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name="StreamingSplit_" + node.name, + SIMD=1, + ChannelsPerStream=channels_per_stream, + inputDataType=model.get_tensor_datatype(node.input[0]).name, + numInputVectors=inp_vec, + outFIFODepths=[2] * len(node.output), + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferStreamingEltwise(Transformation): """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer with AddEltwise, SubEltwise or AbsDiffEltwise op.""" From 6960e1505d2c220c7363488852fb82157282f4e0 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 16 Sep 2024 17:17:07 +0000 Subject: [PATCH 09/10] [Feature] fpgadataflow test for split added --- tests/fpgadataflow/test_fpgadataflow_split.py | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 tests/fpgadataflow/test_fpgadataflow_split.py diff --git a/tests/fpgadataflow/test_fpgadataflow_split.py b/tests/fpgadataflow/test_fpgadataflow_split.py new file mode 100644 index 0000000000..5859b6d5a6 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_split.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import onnx +from onnx import helper as oh +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames + +from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferSplitLayer +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + + +def make_split_model(IN_SHAPE, IN_DTYPE, SPLIT, AXIS): + out_shapes = [IN_SHAPE[:-1] + [s] for s in SPLIT] + outputs = [] + for i in range(len(SPLIT)): + name = "global_out_" + str(i) + out = oh.make_tensor_value_info(name, onnx.TensorProto.FLOAT, out_shapes[i]) + outputs.append(out) + + inp = oh.make_tensor_value_info("global_in", onnx.TensorProto.FLOAT, IN_SHAPE) + split_init = onnx.numpy_helper.from_array( + np.array(SPLIT, dtype=np.int64), name="Split_0_param0" + ) + split_node = oh.make_node( + "Split", [inp.name, split_init.name], [out.name for out in outputs], axis=AXIS + ) + graph = oh.make_graph(nodes=[split_node], name="split_test", inputs=[inp], outputs=outputs) + model = oh.make_model(graph) + model = ModelWrapper(model) + for out in outputs: + model.set_tensor_datatype(out.name, IN_DTYPE) + model.set_tensor_layout(out.name, ["N", "H", "W", "C"]) + model.set_tensor_datatype(inp.name, IN_DTYPE) + model.set_tensor_layout(inp.name, ["N", "H", "W", "C"]) + model.set_initializer(split_init.name, np.array(SPLIT, dtype=np.int64)) + model = model.transform(GiveUniqueNodeNames()) + + return model + + +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_rtlsim"]) +@pytest.mark.parametrize("idt", [DataType["INT3"]]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_split(exec_mode, idt): + fpga_part = "xc7z020clg400-1" + clk_ns = 10 + i_shape = [1, 5, 5, 10] + split = [2, 2, 6] + split_axis = 3 + model = make_split_model(i_shape, idt, split, split_axis) + assert len(model.graph.output) == len(split) + exp_oshapes = [] + for s in split: + oshape = i_shape.copy() + oshape[split_axis] = s + exp_oshapes.append(oshape) + onames = [o.name for o in model.graph.output] + assert all(model.get_tensor_shape(oname) == exp_oshapes[i] for i, oname in enumerate(onames)) + + inputs = [] + for out_shape in exp_oshapes: + inputs.append(np.random.randint(idt.min(), idt.max() + 1, out_shape).astype(np.float32)) 
+ test_input = np.concatenate(inputs, axis=split_axis) + input_dict = {model.graph.input[0].name: test_input} + ret = execute_onnx(model, input_dict) + for i, (k, v) in enumerate(ret.items()): + assert (v == inputs[i]).all() + + # call transformation to convert to HW and verify conversion + model = model.transform(InferSplitLayer()) + assert model.graph.node[0].op_type == "StreamingSplit" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + ret = execute_onnx(model, input_dict) + for i, (k, v) in enumerate(ret.items()): + assert (v == inputs[i]).all() + + model = model.transform(SpecializeLayers(fpga_part)) + assert model.graph.node[0].op_type == "StreamingSplit_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" + if exec_mode == "cppsim": + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + elif exec_mode == "stitched_rtlsim": + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers(fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(fpga_part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform( + CreateStitchedIP( + fpga_part, + clk_ns, + vitis=False, + ) + ) + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", "trace.vcd") + ret_sim = execute_onnx(model, input_dict) + for i, (k, v) in enumerate(ret_sim.items()): + assert (v == inputs[i]).all() From c8c8d49cef0c9374ccca4337bc60701fae3ef450 Mon Sep 17 00:00:00 2001 From: Michal Danilowicz Date: Mon, 23 Sep 2024 13:34:20 +0000 Subject: [PATCH 10/10] [Update] Finn-hlslib commit updated --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index a4fc124fa4..078eb33ec0 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" +HLSLIB_COMMIT="2c066e87f5b8d309693c5d46c206473ca20ac68c" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
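
Note for reviewers: a quick end-to-end sketch of how the pieces above compose. Patch 08's
InferSplitLayer turns a constant, last-axis ONNX Split into the StreamingSplit abstraction
from patch 06, SpecializeLayers binds it to StreamingSplit_hls, and cppsim execution of that
node goes through the timeout template from patch 04. This sketch is illustration only, not
part of the series: "model.onnx", the FPGA part (taken from the new test), and the random
input range are placeholders, and the Split node must pass the checks InferSplitLayer makes
(constant split parameter, last-axis split, integer input datatype).

import numpy as np
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.transformation.general import GiveUniqueNodeNames

from finn.core.onnx_exec import execute_onnx
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.convert_to_hw_layers import InferSplitLayer
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

# placeholder: any model containing a constant, last-axis, integer-typed Split
model = ModelWrapper("model.onnx")

# ONNX Split -> StreamingSplit (HW abstraction layer, patches 06/08)
model = model.transform(InferSplitLayer())
# StreamingSplit -> StreamingSplit_hls (HLS backend variant, patch 06)
model = model.transform(SpecializeLayers("xc7z020clg400-1"))
model = model.transform(GiveUniqueNodeNames())

# cppsim codegen for StreamingSplit_hls uses docompute_template_timeout (patch 04):
# the generated main() keeps invoking the kernel until all output streams have stayed
# empty for $TIMEOUT_VALUE$ consecutive iterations, draining them as data appears
model = model.transform(PrepareCppSim())
model = model.transform(CompileCppSim())
model = model.transform(SetExecMode("cppsim"))

# drive the model with random data matching its input shape
inp_name = model.graph.input[0].name
ishape = model.get_tensor_shape(inp_name)
x = np.random.randint(-4, 4, ishape).astype(np.float32)  # e.g. an INT3-style value range
outputs = execute_onnx(model, {inp_name: x})

The rtlsim and stitched-IP flows follow the same pattern; patch 09's
test_fpgadataflow_split.py exercises all three execution modes.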