diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 5d69802337..62814c487f 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -40,11 +40,16 @@
 
 class AutoFIFOSizingMethod(str, Enum):
     "Select the type of automatic FIFO sizing strategy."
-
     CHARACTERIZE = "characterize"
     LARGEFIFO_RTLSIM = "largefifo_rtlsim"
 
 
+class FIFOCharacterizationMethod(str, Enum):
+    "Select how characteristic functions are obtained for characteristic FIFO sizing."
+    CHARACTERIZE_RTLSIM = "rtlsim"  # derive them empirically via rtlsim
+    CHARACTERIZE_ANALYTICAL = "analytical"  # use analytical functions if available
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -116,9 +121,9 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_set_fifo_depths",
     "step_hw_codegen",
     "step_hw_ipgen",
-    "step_set_fifo_depths",
     "step_create_stitched_ip",
     "step_measure_rtlsim_performance",
     "step_out_of_context_synthesis",
@@ -273,6 +278,15 @@ class DataflowBuildConfig:
     #: setting the FIFO sizes.
     auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
 
+    #: Which strategy will be used for characteristic function-based FIFO sizing.
+    #: CHARACTERIZE_RTLSIM runs an RTL simulation of each node to deduce its
+    #: characteristic functions empirically.
+    #: CHARACTERIZE_ANALYTICAL uses analytical functions where a node provides them,
+    #: avoiding the generation of IP cores for those nodes.
+    characteristic_function_strategy: Optional[
+        FIFOCharacterizationMethod
+    ] = FIFOCharacterizationMethod.CHARACTERIZE_RTLSIM
+
     #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
     #: if set to True, always using Python instead
     force_python_rtlsim: Optional[bool] = False
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ab2280554c..220280031b 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -555,14 +555,18 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             model = model.transform(InsertDWC())
             model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
             model = model.transform(GiveUniqueNodeNames())
+            model = model.transform(AnnotateCycles())
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)
             model = model.transform(
-                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+                DeriveCharacteristic(
+                    model,
+                    period,
+                    cfg.characteristic_function_strategy,
+                    cfg._resolve_fpga_part(),
+                    cfg._resolve_hls_clk_period(),
+                )
             )
-            model = model.transform(HLSSynthIP())
-            model = model.transform(PrepareRTLSim())
-            model = model.transform(AnnotateCycles())
-            period = model.analysis(dataflow_performance)["max_cycles"] + 10
-            model = model.transform(DeriveCharacteristic(period))
             model = model.transform(DeriveFIFOSizes())
             model = model.transform(
                 InsertFIFO(
@@ -625,6 +629,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         "depth_trigger_uram",
         "depth_trigger_bram",
     ]
+
     extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)
 
     # perform FIFO splitting and shallow FIFO removal only after the final config
@@ -636,8 +641,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
 
     # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
     # this will only run for the new nodes (e.g. FIFOs and DWCs)
-    model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
-    model = model.transform(HLSSynthIP())
+    # model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
+    # model = model.transform(HLSSynthIP())
     return model
 
 
diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py
index ac61786ac1..4af2b64197 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams.py
+++ b/src/finn/custom_op/fpgadataflow/addstreams.py
@@ -159,7 +159,9 @@ def get_verilog_top_module_intf_names(self):
         intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
         return intf_names
 
-    def derive_characteristic_fxns(self, period):
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
         n_inps = np.prod(self.get_folded_input_shape()[:-1])
         io_dict = {
             "inputs": {
@@ -168,4 +170,6 @@ def derive_characteristic_fxns(self, period):
             },
             "outputs": {"out": []},
         }
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+        super().derive_characteristic_fxns(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
+        )
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py
index 9bf4ebdf62..1f17ddc851 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op.py
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py
@@ -232,3 +232,41 @@ def execute_node(self, context, graph):
         sess = rt.InferenceSession(model_func.SerializeToString())
         result = sess.run(None, idict)
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Collect the parameters consumed by the analytical characteristic functions.
+
+        Returns the tuple ``(NF, dim)``: the channel fold ``NumChannels / PE`` and the
+        product of the folded output shape with its first and last axes dropped.
+        """
+        pe = self.get_nodeattr("PE")
+        n_channels = self.get_nodeattr("NumChannels")
+        # channel fold; int(a / b) truncates the quotient
+        fold = int(n_channels / pe)
+        inner_size = np.prod(self.get_folded_output_shape()[1:-1])
+        return (fold, inner_size)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        ``kwargs`` is the ``(NF, dim)`` tuple; only ``dim`` is used. On each of ``dim``
+        cycles the running counter is appended to ``txns`` and then incremented.
+        ``txns`` is extended in place and also returned.
+        """
+        _, n_words = kwargs
+        # values counter, counter + 1, ... - one entry per cycle
+        span = range(counter, counter + n_words)
+        txns.extend(span)
+        return txns, cycles + len(span), counter + len(span)
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        On each of ``dim`` cycles (second entry of ``kwargs``) the running counter is
+        appended to ``txns`` and then incremented; ``txns`` is modified in place.
+        """
+        _, n_words = kwargs
+        steps = range(n_words)
+        txns += [counter + offset for offset in steps]
+        elapsed = len(steps)
+        return txns, cycles + elapsed, counter + elapsed
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 1fb4940fb4..c00603f375 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -277,3 +277,243 @@ def execute_node(self, context, graph):
         # this automatically updates the execution context
         inst = getCustomOp(im2col_node)
         inst.execute_node(context, model_im2col.graph)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Precompute the constants used by the analytical characteristic functions.
+
+        Reads the node attributes and returns selected ones, together with derived
+        quantities, as one positional tuple; characteristic_fx_input and
+        characteristic_fx_output unpack the tuple in exactly this order.
+        """
+        IFMDim_x = self.get_nodeattr("IFMDim")[0]
+        OFMDim_x = self.get_nodeattr("OFMDim")[0]
+        ConvKernelDim_x = self.get_nodeattr("ConvKernelDim")[0]
+        Stride_x = self.get_nodeattr("Stride")[0]
+
+        OFMDim_y = self.get_nodeattr("OFMDim")[1]
+        ConvKernelDim_y = self.get_nodeattr("ConvKernelDim")[1]
+        Stride_y = self.get_nodeattr("Stride")[1]
+
+        SIMD = self.get_nodeattr("SIMD")
+
+        IFMChannels = self.get_nodeattr("IFMChannels")
+
+        DEPTHWISE = self.get_nodeattr("depthwise")
+        is1d = self.get_nodeattr("is1D")
+
+        SIMD_COUNT = int(IFMChannels / SIMD)
+        OUTPUT_SIZE = OFMDim_x * ConvKernelDim_x * SIMD_COUNT
+        INPUT_SIZE = IFMDim_x * SIMD_COUNT
+        WINDOW_SIZE = ConvKernelDim_x * SIMD_COUNT
+        if DEPTHWISE:
+            BUFFER_SIZE = ConvKernelDim_x * SIMD_COUNT
+            READ_CYCLES = SIMD_COUNT * (ConvKernelDim_x - 1) - (ConvKernelDim_x - 1)
+            FINISH = IFMDim_x - ConvKernelDim_x - 2
+        else:
+            BUFFER_SIZE = (ConvKernelDim_x - 1) * SIMD_COUNT
+            READ_CYCLES = 0
+            FINISH = 0
+
+        OCNT_INITIAL = BUFFER_SIZE + (Stride_x - 1)
+
+        DEFAULT_FIFO_DEPTH = 2
+
+        multiplying_factor = int(IFMChannels / SIMD)
+        number_blocks = int(ConvKernelDim_y / Stride_y + 1)
+        cycles_write_block = OFMDim_x * ConvKernelDim_x * ConvKernelDim_y * multiplying_factor
+        cycles_read_block = Stride_x * IFMDim_x * multiplying_factor
+        max_cycles = max(cycles_write_block, cycles_read_block)  # also recomputed inline below
+        baseIter = IFMDim_x * ConvKernelDim_y * multiplying_factor + OFMDim_y * max(
+            cycles_write_block, cycles_read_block
+        )
+        initial_buffer = IFMDim_x * ConvKernelDim_y * multiplying_factor
+
+        READ_DELAY = (
+            number_blocks
+            * ConvKernelDim_x
+            * ConvKernelDim_y
+            * OFMDim_x
+            * OFMDim_y
+            * multiplying_factor
+            - ConvKernelDim_x * ConvKernelDim_y * OFMDim_x
+        )
+        READ_ITES = int((baseIter - OFMDim_y) / max(cycles_write_block, cycles_read_block))
+
+        kwargs = (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        )
+
+        return kwargs
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Compute one period of the input characteristic function."""
+
+        (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        ) = kwargs
+
+        if DEPTHWISE:
+            OCNT_MAX = BUFFER_SIZE
+            ocnt = SIMD_COUNT
+
+        else:
+            OCNT_MAX = WINDOW_SIZE
+            if OCNT_INITIAL < WINDOW_SIZE:
+                ocnt = OCNT_INITIAL
+            else:
+                ocnt = -1
+
+        # fifo filling: one input is counted per cycle for DEFAULT_FIFO_DEPTH cycles
+        for i in range(0, DEFAULT_FIFO_DEPTH):
+            txns.append(counter)
+            counter += 1
+            cycles += 1
+
+        # main function (inp_count counts inputs consumed after the fifo filling above)
+
+        inp_count = 0
+
+        if is1d:
+            for i in range(0, OUTPUT_SIZE):
+                txns.append(counter)
+                we = (i < OCNT_MAX) or (ocnt < (SIMD_COUNT * Stride_x))
+                re = i > 0
+
+                if re:
+                    ocnt += 1
+                    if ocnt == OCNT_MAX:
+                        ocnt = 0
+                if we:
+                    if inp_count < INPUT_SIZE - DEFAULT_FIFO_DEPTH:
+                        counter += 1
+                        inp_count += 1
+
+                cycles += 1
+        else:
+            for i in range(0, initial_buffer + cycles_read_block - 1):
+                txns.append(counter)
+                cycles += 1
+                counter += 1
+
+            txns.append(counter)
+            cycles += 1  # one  extra for loop tail
+
+            for i in range(0, OFMDim_y - 1):
+                for j in range(0, cycles_write_block - cycles_read_block):
+                    txns.append(counter)
+                    cycles += 1
+
+                for j in range(0, cycles_read_block - 1):
+                    if i < OFMDim_y - 2:
+                        counter += 1
+                        txns.append(counter)
+                        cycles += 1
+                #   else:
+                #   if j < FINISH:
+                #        counter+=1
+                #        txns.append(counter)
+                #       cycles+=1
+        #
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        The period is modelled as a sequence of phases; in a stall phase the
+        counter is appended unchanged every cycle, in an emit phase it is
+        appended and then incremented every cycle.
+        ``txns`` is modified in place and returned with the updated cycle/counter.
+        """
+        (
+            SIMD_COUNT,
+            Stride_x,
+            Stride_y,
+            OUTPUT_SIZE,
+            INPUT_SIZE,
+            WINDOW_SIZE,
+            BUFFER_SIZE,
+            READ_CYCLES,
+            OCNT_INITIAL,
+            DEPTHWISE,
+            DEFAULT_FIFO_DEPTH,
+            is1d,
+            multiplying_factor,
+            number_blocks,
+            cycles_write_block,
+            cycles_read_block,
+            max_cycles,
+            baseIter,
+            initial_buffer,
+            FINISH,
+            OFMDim_y,
+            READ_DELAY,
+            READ_ITES,
+        ) = kwargs
+
+        # hyper-parameter: fixed number of stall cycles at the start of the period
+        INITIAL_LOOP_CYCLES = 5
+
+        # each phase is (number of cycles, counter increment per cycle)
+        if is1d:
+            phases = [
+                (INITIAL_LOOP_CYCLES, 0),
+                (READ_CYCLES, 0),
+                (OUTPUT_SIZE, 1),
+            ]
+        else:
+            phases = [
+                (initial_buffer + INITIAL_LOOP_CYCLES - 1, 0),
+                (baseIter - initial_buffer, 1),
+            ]
+
+        for n_cycles, step in phases:
+            for _ in range(n_cycles):
+                txns.append(counter)
+                counter += step
+                cycles += 1
+
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py
index 8943ffc9e3..ac59868f27 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py
@@ -40,20 +40,25 @@ def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
 
     def get_nodeattr_types(self):
-        my_attrs = {
-            "NumChannels": ("i", True, 0),
-            "PE": ("i", True, 0),
-            # how many duplicated output streams to create
-            "NumOutputStreams": ("i", True, 0),
-            # FINN DataTypes for input
-            "inputDataType": ("s", True, ""),
-            # number of input vectors, examples:
-            # [1] is a single vector (like a FC layer with batch=1)
-            # [4] is four vectors (like a FC layer with batch=4)
-            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
-            "numInputVectors": ("ints", False, [1]),
-        }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs = super().get_nodeattr_types()  # base attrs first: entries below override
+        my_attrs.update(
+            {
+                "NumChannels": ("i", True, 0),
+                "PE": ("i", True, 0),
+                # how many duplicated output streams to create
+                "NumOutputStreams": ("i", True, 0),
+                # FINN DataTypes for input
+                "inputDataType": ("s", True, ""),
+                # number of input vectors, examples:
+                # [1] is a single vector (like a FC layer with batch=1)
+                # [4] is four vectors (like a FC layer with batch=4)
+                # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+                "numInputVectors": ("ints", False, [1]),
+                # TODO: how to set a default value depending on NumOutputStreams?
+                # transformations like set_fifo_depth expect this attribute for every i/o of every node
+                "outFIFODepths": ("ints", False, [2, 2]),  # default has two entries only
+            }
+        )
         return my_attrs
 
     def get_num_output_streams(self):
@@ -166,7 +171,9 @@ def get_verilog_top_module_intf_names(self):
             )
         return intf_names
 
-    def derive_characteristic_fxns(self, period):
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
         n_inps = np.prod(self.get_folded_input_shape()[:-1])
         io_dict = {
             "inputs": {
@@ -174,4 +181,7 @@ def derive_characteristic_fxns(self, period):
             },
             "outputs": {"out0": [], "out1": []},
         }
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+
+        super().derive_characteristic_fxns(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
+        )
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py
index 5767028ea7..bf1415d4ca 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding.py
@@ -170,3 +170,59 @@ def execute_node(self, context, graph):
             inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
         )
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather the geometry needed by the analytical characteristic functions.
+
+        Returns ``(ImgDim, NewDim, Padding, NumChannels, SIMD, TOTAL_ELS, NF)`` where
+        ``NewDim`` is the padded image size per axis, ``TOTAL_ELS`` its product and
+        ``NF`` the channel fold ``NumChannels / SIMD``.
+        """
+        img_dim = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        # axis 0 grows by Padding[0] + Padding[2], axis 1 by Padding[1] + Padding[3]
+        padded_dim = [img_dim[0] + pad[0] + pad[2], img_dim[1] + pad[1] + pad[3]]
+        n_channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        padded_els = np.prod(padded_dim)
+        fold = int(n_channels / simd)
+        return (img_dim, padded_dim, pad, n_channels, simd, padded_els, fold)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        Every position of the padded image takes ``NF`` cycles (plus one when
+        ``NF == 1``); the counter only advances at positions outside the padding.
+        """
+        (_, NewDim, Padding, _, _, _, NF) = kwargs
+        y_lo, y_hi = Padding[0], NewDim[0] - Padding[2]
+        x_lo, x_hi = Padding[1], NewDim[1] - Padding[3]
+        for row in range(NewDim[0]):
+            row_inside = y_lo <= row < y_hi
+            for col in range(NewDim[1]):
+                # advance the counter only where a non-padding position is visited
+                step = 1 if (row_inside and x_lo <= col < x_hi) else 0
+                for _ in range(NF):
+                    txns.append(counter)
+                    counter += step
+                    cycles += 1
+                if NF == 1:  # loop end delay when fully unrolled
+                    txns.append(counter)
+                    cycles += 1
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        Each of the TOTAL_ELS positions emits NF words, plus one idle cycle when NF == 1.
+        """
+        (_, _, _, _, _, TOTAL_ELS, NF) = kwargs
+        for _ in range(TOTAL_ELS):
+            burst = range(counter, counter + NF)
+            txns.extend(burst)
+            counter += len(burst)
+            cycles += len(burst)
+            if NF == 1:  # loop end delay when fully unrolled
+                txns.append(counter)
+                cycles += 1
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
index b753bc7a03..0a4ffc3fea 100644
--- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
@@ -735,18 +735,3 @@ def ipgen_extra_directives(self):
         "Return a list of extra tcl directives for HLS synthesis."
 
         return ["config_compile -pipeline_style frp"]
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["internal_decoupled", "external"]:
-            n_weight_inps = self.calc_tmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index b40b8f3074..423535c859 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -34,7 +34,8 @@
 from qonnx.custom_op.base import CustomOp
 from qonnx.util.basic import roundup_to_integer_multiple
 
-from finn.util.basic import pyverilate_get_liveness_threshold_cycles
+from finn.util.basic import make_build_dir, pyverilate_get_liveness_threshold_cycles
+from finn.util.fpgadataflow import is_hls_node
 
 try:
     from pyverilator import PyVerilator
@@ -94,8 +95,10 @@ def get_nodeattr_types(self):
             # the period for which the characterization was run
             "io_chrc_period": ("i", False, 0),
             # amount of zero padding inserted during chrc.
-            "io_chrc_pads_in": ("ints", False, []),
-            "io_chrc_pads_out": ("ints", False, []),
+            "io_chrc_pads_in": ("i", False, 0),
+            "io_chrc_pads_out": ("i", False, 0),
+            "io_chrc_in_concat": ("t", False, np.asarray([], dtype=np.int32)),
+            "io_chrc_out_concat": ("t", False, np.asarray([], dtype=np.int32)),
         }
 
     def get_verilog_top_module_name(self):
@@ -361,10 +364,162 @@ def get_outstream_width_padded(self, ind=0):
         out_width = self.get_outstream_width(ind=ind)
         return roundup_to_integer_multiple(out_width, 8)
 
-    def derive_characteristic_fxns(self, period, override_rtlsim_dict=None):
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
+        """Derive this node's characteristic functions with the requested strategy.
+
+        The "analytical" strategy is honoured only for nodes that implement
+        ``prepare_kwargs_for_characteristic_fx``; everything else goes through rtlsim.
+        ``op_type`` is not used here; ``override_dict`` replaces the default io dict.
+        """
+        io_dict = override_dict
+        if io_dict is None:
+            # default stimulus: n_inps zeros on "in0" and a single "out" stream
+            n_inps = np.prod(self.get_folded_input_shape()[:-1])
+            io_dict = {
+                "inputs": {"in0": [0 for _ in range(n_inps)]},
+                "outputs": {"out": []},
+            }
+
+        # a node opts into the analytical flow by providing the kwargs hook
+        has_analytical_model = strategy == "analytical" and callable(
+            getattr(self, "prepare_kwargs_for_characteristic_fx", None)
+        )
+        if has_analytical_model:
+            self.derive_characteristic_fxns_analytically(period, io_dict=io_dict)
+        else:
+            self.derive_characteristic_fxns_rtlsim(
+                model, period, fpga_part, clk_period, io_dict=io_dict
+            )
+
+    def derive_characteristic_fxns_analytically(self, period, io_dict):
+        """Derive the characteristic functions from the node's analytical model.
+
+        The node must provide ``prepare_kwargs_for_characteristic_fx``,
+        ``characteristic_fx_input`` and ``characteristic_fx_output``. Each
+        characteristic function is evaluated for two consecutive periods; a period
+        that finishes early is padded by repeating the current transaction count.
+        The hooks take no stream index, so when ``io_dict`` names several input or
+        output streams every one of them receives the same trace.
+
+        Sets the node attributes ``io_chrc_period``, ``io_chrc_in``,
+        ``io_chrc_pads_in``, ``io_chrc_out`` and ``io_chrc_pads_out``.
+        ``io_chrc_in``/``io_chrc_out`` hold one row per stream.
+
+        Args:
+            period: number of cycles in one characterization period; the stored
+                traces span ``2 * period`` cycles.
+            io_dict: dict with "inputs" and "outputs" sub-dicts; every key that
+                contains "in" (resp. "out") counts as one stream to characterize.
+        """
+        # only the number of matching streams matters for the table shapes
+        n_in = sum(1 for key in io_dict["inputs"] if "in" in key)
+        n_out = sum(1 for key in io_dict["outputs"] if "out" in key)
+
+        self.set_nodeattr("io_chrc_period", period)
+
+        kwargs = self.prepare_kwargs_for_characteristic_fx()
+
+        def two_period_trace(char_fx):
+            """Return (trace of length 2 * period, total padding) for ``char_fx``."""
+            # first period: empty trace, cycle 0, counter 0
+            txns, cycles, counter = char_fx([], 0, 0, kwargs)
+            # NOTE: head_pad/tail_pad are negative when char_fx overruns the period;
+            # the list repetition then adds nothing and the overrun is truncated below
+            head_pad = period - cycles
+            txns += [counter] * head_pad
+
+            # second period: cycle count restarts at ``period``, counter carries on
+            txns, cycles, counter = char_fx(txns, period, counter, kwargs)
+            tail_pad = period * 2 - cycles
+            txns += [counter] * tail_pad
+
+            # a trace that overran is cut back to exactly two periods
+            return np.array(txns[: period * 2]), head_pad + tail_pad
+
+        def per_stream_table(trace, n_streams):
+            # write the trace to every row so that no row of the np.empty buffer
+            # stays uninitialised; the assignment casts the values to int32
+            table = np.empty((n_streams, 2 * period), dtype=np.int32)
+            table[:, :] = trace
+            return table
+
+        # INPUT
+        trace_in, pad_in = two_period_trace(self.characteristic_fx_input)
+        self.set_nodeattr("io_chrc_in", per_stream_table(trace_in, n_in))
+        self.set_nodeattr("io_chrc_pads_in", pad_in)
+
+        # OUTPUT
+        trace_out, pad_out = two_period_trace(self.characteristic_fx_output)
+        self.set_nodeattr("io_chrc_out", per_stream_table(trace_out, n_out))
+        self.set_nodeattr("io_chrc_pads_out", pad_out)
+
+    def derive_characteristic_fxns_rtlsim(self, model, period, fpga_part, clk_period, io_dict=None):
         """Return the unconstrained characteristic functions for this node."""
         # ensure rtlsim is ready
-        assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name
+        if self.get_nodeattr("rtlsim_so") == "":
+            # generate the IP for this node
+
+            # lazy construction of prepare_ip step
+            node = self.onnx_node
+            op_type = node.op_type
+            # get the path of the code generation directory
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # ensure that there is a directory
+            if code_gen_dir == "" or not os.path.isdir(code_gen_dir):
+                code_gen_dir = make_build_dir(
+                    prefix="code_gen_ipgen_" + str(self.onnx_node.name) + "_"
+                )
+                self.set_nodeattr("code_gen_dir_ipgen", code_gen_dir)
+                # ensure that there is generated code inside the dir
+                self.code_generation_ipgen(model, fpga_part, clk_period)
+
+            # lazy construction of hlssynthip step
+            if is_hls_node(node):
+                # ensure that code is generated
+                try:
+                    assert (
+                        self.get_nodeattr("code_gen_dir_ipgen") != ""
+                    ), """Node
+                    attribute "code_gen_dir_ipgen" is empty. Please run
+                    transformation PrepareIP first."""
+                    if not os.path.isdir(self.get_nodeattr("ipgen_path")) or not self.get_nodeattr(
+                        "code_gen_dir_ipgen"
+                    ) in self.get_nodeattr("ipgen_path"):
+                        # call the compilation function for this node
+                        self.ipgen_singlenode_code()
+                    else:
+                        warnings.warn("Using pre-existing IP for %s" % self.onnx_node.name)
+                    # ensure that executable path is now set
+                    assert (
+                        self.get_nodeattr("ipgen_path") != ""
+                    ), """Transformation
+                    HLSSynthIP was not successful. Node attribute "ipgen_path"
+                    is empty."""
+                except KeyError:
+                    # exception if op_type is not supported
+                    raise Exception("Custom op_type %s is currently not supported." % op_type)
+
+                # lazy construction of prepare rtlsim step
+
+                try:
+                    self.prepare_rtlsim()
+                    # ensure that executable path is now set
+                    assert (
+                        self.get_nodeattr("rtlsim_so") != ""
+                    ), "Failed to prepare RTLSim, no rtlsim_so attribute found."
+                except KeyError:
+                    # exception if op_type is not supported
+                    raise Exception("Custom op_type %s is currently not supported." % op_type)
+            else:
+                self.prepare_rtlsim()
+                # ensure that executable path is now set
+                assert (
+                    self.get_nodeattr("rtlsim_so") != ""
+                ), "Failed to prepare RTLSim, no rtlsim_so attribute found."
+
+        # assert , "rtlsim not ready for " + self.onnx_node.name
         if self.get_nodeattr("io_chrc_period") > 0:
             warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name)
             return
@@ -384,15 +539,6 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None):
         sim = self.get_rtlsim()
         # signal name
         sname = "_" + self.hls_sname() + "_"
-        if override_rtlsim_dict is not None:
-            io_dict = override_rtlsim_dict
-        else:
-            io_dict = {
-                "inputs": {
-                    "in0": [0 for i in range(n_inps)],
-                },
-                "outputs": {"out": []},
-            }
 
         # extra dicts to keep track of cycle-by-cycle transaction behavior
         # note that we restrict key names to filter out weight streams etc
@@ -447,6 +593,8 @@ def accumulate_char_fxn(chrc):
         all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32)
         all_pad_in = []
         all_pad_out = []
+        pad_in = 0
+        pad_out = 0
         for in_idx, in_strm_nm in enumerate(txns_in.keys()):
             txn_in = txns_in[in_strm_nm]
             if len(txn_in) < period:
diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py
index f4b098cff7..dd88e331a2 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect.py
+++ b/src/finn/custom_op/fpgadataflow/labelselect.py
@@ -184,3 +184,52 @@ def get_exp_cycles(self):
         pe = self.get_nodeattr("PE")
         exp_cycles = nlabels / pe
         return int(exp_cycles)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Collect the parameters consumed by the analytical characteristic functions.
+
+        Returns:
+            The tuple ``(num_in_words, PE, K)``, read from the node attributes
+            ``Labels``, ``PE`` and ``K`` respectively.
+        """
+        # attribute names in the order the characteristic functions unpack them
+        attr_names = ("Labels", "PE", "K")
+        values = [self.get_nodeattr(name) for name in attr_names]
+
+        return tuple(values)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        For ``int(num_in_words / PE) + 1`` cycles the running counter is appended
+        to ``txns`` (in place) and then incremented.
+        """
+        (num_in_words, PE, _) = kwargs
+        n_cycles = int(num_in_words / PE) + 1
+        span = range(counter, counter + n_cycles)
+        txns.extend(span)
+        elapsed = len(span)
+        return txns, cycles + elapsed, counter + elapsed
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        The counter is held during the wind-up and label-computation cycles and is
+        then incremented once per cycle for each of the ``K`` selected labels.
+        ``txns`` is modified in place.
+        """
+        (num_in_words, PE, K) = kwargs
+
+        windup_clocks = 4
+        # each phase is (number of cycles, counter increment per cycle)
+        phases = (
+            (windup_clocks, 0),  # wind-up cycles
+            (int(num_in_words / PE + K), 0),  # first output period, computing Labels
+            (K, 1),  # output the K labels which got selected
+        )
+        for n_cycles, step in phases:
+            for _ in range(n_cycles):
+                txns.append(counter)
+                cycles += 1
+                counter += step
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 8f0a987bce..890ff1cda2 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -840,21 +840,6 @@ def get_op_and_param_counts(self):
             ret_dict[thres_param_type] = thres_count
         return ret_dict
 
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["internal_decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
-
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
@@ -973,3 +958,85 @@ def code_generation_ipi(self):
         else:
             raise Exception("Unrecognized mem_mode for MatrixVectorActivation")
         return cmd
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather the folding parameters used by the analytical characteristic model.
+
+        Returns:
+            ``(MW, MH, SIMD, PE, BURST_COUNT, BURST_SIZE, numVectors)`` where
+            ``BURST_SIZE = int(MW / SIMD)`` and ``BURST_COUNT = int(MH / PE)``.
+        """
+        mw, mh = self.get_nodeattr("MW"), self.get_nodeattr("MH")
+        simd, pe = self.get_nodeattr("SIMD"), self.get_nodeattr("PE")
+        n_vectors = np.prod(self.get_nodeattr("numInputVectors"))
+        burst_size, burst_count = int(mw / simd), int(mh / pe)
+
+        return (mw, mh, simd, pe, burst_count, burst_size, n_vectors)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        Per input vector, up to ``BURST_SIZE`` cycles advance the counter, followed
+        by ``(BURST_COUNT - 1) * BURST_SIZE`` cycles during which it is held. With
+        more than one vector, two reads are pulled ahead of the first vector; the
+        reads inside the loop stop once ``numVectors * BURST_SIZE`` is reached.
+        """
+        (_mw, _mh, _simd, _pe, burst_count, burst_size, n_vectors) = kwargs
+
+        read_budget = n_vectors * burst_size
+        reads_done = 0
+        start_len = len(txns)
+
+        # two reads are issued up front when several vectors are streamed
+        if n_vectors > 1:
+            txns.extend((counter, counter + 1))
+            counter, reads_done = counter + 2, 2
+
+        stall_len = max(0, burst_count - 1) * max(0, burst_size)
+        for _ in range(n_vectors):
+            n_reads = int(min(max(0, burst_size), max(0, read_budget - reads_done)))
+            txns.extend(range(counter, counter + n_reads))
+            counter, reads_done = counter + n_reads, reads_done + n_reads
+            txns.extend([counter] * stall_len)
+
+        return txns, cycles + len(txns) - start_len, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        After a fixed windup, the counter is held for ``BURST_SIZE`` cycles and
+        then advanced by one; this repeats ``numVectors * BURST_COUNT`` times.
+        """
+        (_mw, _mh, _simd, _pe, burst_count, burst_size, n_vectors) = kwargs
+        first = len(txns)
+
+        # fixed windup latency of 3 cycles assumed by the model
+        txns.extend([counter] * 3)
+
+        hold = max(0, burst_size)
+        for _ in range(max(0, int(n_vectors)) * max(0, burst_count)):
+            txns.extend([counter] * hold)
+            counter += 1
+        return txns, cycles + len(txns) - first, counter
+
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
+        """Build the stimulus dict for this node and defer to the base implementation.
+
+        NOTE(review): the ``override_dict`` argument is not read here; the dict
+        built below is always the one forwarded to the base class -- confirm
+        this is intended.
+        """
+        # one zero-valued entry per index of range(prod(folded input shape[:-1]))
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        stimulus = {"inputs": {"in0": [0 for _ in range(n_inps)]}, "outputs": {"out": []}}
+
+        # weights get their own entry only for these two memory modes
+        # TODO: Why is num_w_reps (prod of numInputVectors) not considered here?
+        if self.get_nodeattr("mem_mode") in ["internal_decoupled", "external"]:
+            stimulus["inputs"]["weights"] = [0 for _ in range(1 * self.calc_wmem())]
+
+        super().derive_characteristic_fxns(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=stimulus
+        )
diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py
index 35aee023b9..b548548013 100644
--- a/src/finn/custom_op/fpgadataflow/pool.py
+++ b/src/finn/custom_op/fpgadataflow/pool.py
@@ -222,3 +222,43 @@ def execute_node(self, context, graph):
             result = np.right_shift(result.astype(int), shift_bits)
         oshape = context[node.output[0]].shape
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather the loop bounds used by the analytical characteristic model.
+
+        Returns:
+            ``(NF, KernelSize)`` with ``NF = int(Channels / PE)`` and ``KernelSize``
+            the product of the entries of the ``KernelSize`` node attribute.
+        """
+        n_channels = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        kernel_elems = np.prod(self.get_nodeattr("KernelSize"))
+        n_folds = int(n_channels / pe)
+
+        return (n_folds, kernel_elems)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        One sample is appended per modelled cycle and the counter advances on
+        each of the ``KernelSize * NF`` cycles.
+        """
+        n_folds, kernel_elems = kwargs
+
+        # total cycle count; either factor being non-positive means no cycles
+        n_steps = max(0, int(kernel_elems)) * max(0, n_folds)
+        txns.extend(range(counter, counter + n_steps))
+
+        return txns, cycles + n_steps, counter + n_steps
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        One sample is appended per modelled cycle and the counter advances on
+        each of the ``KernelSize * NF`` cycles; no windup is modelled.
+        """
+        n_folds, kernel_elems = kwargs
+        for _fold in range(max(0, int(kernel_elems)) * max(0, n_folds)):
+            txns.append(counter)
+            counter, cycles = counter + 1, cycles + 1
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
index 4921caeb00..d9f07e822f 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
@@ -214,3 +214,145 @@ def lut_estimation(self):
             cset_luts += outw
 
         return int(cnt_luts + cset_luts)
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather word counts and stream widths for the characteristic model.
+
+        Returns:
+            ``(numInWords, numOutWords, inWidth, outWidth, numReps)``; the word
+            counts are the second-to-last folded dims, ``numReps`` the first.
+        """
+        n_in = int(np.prod(self.get_folded_input_shape()[-2:-1]))
+        n_out = int(np.prod(self.get_folded_output_shape()[-2:-1]))
+        n_reps = int(np.prod(self.get_folded_input_shape()[:1]))
+        widths = (self.get_nodeattr("inWidth"), self.get_nodeattr("outWidth"))
+        return (n_in, n_out, *widths, n_reps)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        One sample is appended per modelled clock cycle: the value of
+        ``counter`` at that cycle. ``counter`` advances on cycles that read an
+        input word and is held on stall cycles.
+
+        Modelled schedule:
+          * if ``numReps > 1``, two reads are issued ahead of the main loop;
+          * each repetition then reads up to ``numInWords`` words (reads in the
+            loop stop at ``numReps * numInWords``) and stalls for one cycle.
+
+        Args:
+            txns: list of samples so far; extended in place.
+            cycles: number of cycles modelled so far.
+            counter: running transaction count.
+            kwargs: tuple produced by ``prepare_kwargs_for_characteristic_fx``.
+
+        Returns:
+            The updated ``(txns, cycles, counter)``.
+        """
+        (numInWords, _numOutWords, _inWidth, _outWidth, numReps) = kwargs
+
+        # the input model uses no windup constants, so none are defined here
+        words_read = 0
+        word_budget = numReps * numInWords
+
+        if numReps > 1:
+            # loop windup: two reads happen before the per-repetition pattern
+            for _ in range(2):
+                txns.append(counter)
+                counter += 1
+                cycles += 1
+                words_read += 1
+
+        for _rep in range(numReps):
+            for _word in range(numInWords):
+                if words_read < word_budget:
+                    txns.append(counter)
+                    counter += 1
+                    cycles += 1
+                    words_read += 1
+            # single stall cycle per repetition: counter is held
+            txns.append(counter)
+            cycles += 1
+
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        One sample (the current value of ``counter``) is appended per modelled
+        clock cycle; ``counter`` advances each time an output word is produced.
+
+        Fixes relative to the earlier revision of this model:
+          * the windup and main loop were nested under the ``else`` of the
+            width-divisibility check, so for widths that do not divide evenly
+            nothing was appended and ``pad`` could never be True where it is
+            read; they now run for every width combination;
+          * a leftover debug ``print`` in the padding branch is removed;
+          * windup constants that were computed but never read are removed.
+
+        Args:
+            txns: list of samples so far; extended in place.
+            cycles: number of cycles modelled so far.
+            counter: running transaction count.
+            kwargs: tuple produced by ``prepare_kwargs_for_characteristic_fx``.
+
+        Returns:
+            The updated ``(txns, cycles, counter)``.
+        """
+        (numInWords, numOutWords, inWidth, outWidth, numReps) = kwargs
+
+        # HYPER PARAMETERS WHICH MAY CHANGE
+        # windup lengths (cycles), clipped to the number of words available
+        windup_clocks_up_convert_input = min(3, numInWords)
+        windup_clocks_equal_convert_output = min(2, numOutWords)
+
+        # calculation to adjust for padding or cropping adding latency
+        higher = max(inWidth, outWidth)
+        lower = min(inWidth, outWidth)
+
+        # padding is only modelled when the widths do not divide evenly and the
+        # input carries no more bits than the output
+        pad = (higher % lower != 0) and not (numInWords * inWidth > numOutWords * outWidth)
+
+        # windup period
+        if inWidth == outWidth:
+            clock = windup_clocks_equal_convert_output
+        else:
+            clock = windup_clocks_up_convert_input
+        for _ in range(clock):
+            txns.append(counter)
+            cycles += 1
+
+        # whole input words per output word (0 when outWidth < inWidth)
+        words_per_out = int(np.floor(outWidth / inWidth))
+
+        # running bit balance: += inWidth per word taken, -= outWidth per emit
+        remainder = 0
+
+        for _rep in range(numReps):
+            # windup
+            txns.append(counter)
+            cycles += 1
+
+            for _out in range(numOutWords):
+                for j in range(words_per_out):
+                    # j == 0 adds no cycle of its own; each further input word
+                    # of the group costs one cycle with the counter held
+                    if j != 0:
+                        txns.append(counter)
+                        cycles += 1
+                    remainder += inWidth
+
+                # one extra held cycle while the balance is short of outWidth
+                if pad and remainder < outWidth:
+                    txns.append(counter)
+                    remainder += inWidth
+                    cycles += 1
+
+                # emit cycle: sample is appended, then the counter advances
+                txns.append(counter)
+                cycles += 1
+                counter += 1
+                remainder -= outWidth
+
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py
index 59a8f092d0..92c004d90a 100755
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py
@@ -234,3 +234,240 @@ def execute_node(self, context, graph):
         # convert output NCHW -> NHWC
         result = np.transpose(result, (0, 2, 3, 1))
         context[node.output[0]] = result
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather everything the analytical characteristic functions need.
+
+        Returns:
+            A 12-tuple, in the order both characteristic functions unpack it:
+            ``(ifm_dim, output_size, is1d, NumChannels, PoolDim, ImgDim, PE,
+            windup_clocks, read_delay, bursts, read_tail_latency,
+            write_tail_latency)``.
+        """
+        ifm_dim, k, _ifm_ch = self.get_1d_attrs_normalized()
+        ceil_mode = self.get_nodeattr("CeilMode")
+        # NOTE(review): k[1] is passed twice (presumably kernel and stride), then 0
+        output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode)
+        is1d = self.is_1d()
+
+        num_channels = self.get_nodeattr("NumChannels")
+        # only the first entry of the PoolDim / ImgDim attributes is used
+        pool_dim = self.get_nodeattr("PoolDim")[0]
+        img_dim = self.get_nodeattr("ImgDim")[0]
+        pe = self.get_nodeattr("PE")
+
+        # fixed latencies (in cycles) assumed by the model
+        windup_clocks = 4
+        read_delay = 5
+        read_tail_latency = 6
+        write_tail_latency = 14
+        bursts = int(read_delay + img_dim / pool_dim)
+        return (
+            ifm_dim,
+            output_size,
+            is1d,
+            num_channels,
+            pool_dim,
+            img_dim,
+            pe,
+            windup_clocks,
+            read_delay,
+            bursts,
+            read_tail_latency,
+            write_tail_latency,
+        )
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        One sample (the current ``counter``) is appended per modelled cycle;
+        ``counter`` advances only on cycles that read an input word. The 2D and
+        1D cases follow different schedules (see the two branches below).
+        """
+        (
+            ifm_dim,
+            output_size,
+            is1d,
+            NumChannels,
+            PoolDim,
+            ImgDim,
+            PE,
+            windup_clocks,
+            read_delay,
+            bursts,
+            read_tail_latency,
+            write_tail_latency,
+        ) = kwargs
+
+        if ImgDim > PoolDim * output_size:
+            REMAINDER_PIXELS = ImgDim - output_size * PoolDim
+        else:
+            REMAINDER_PIXELS = 0
+
+        tracker = 0
+        # NOTE(review): "/" and "*" associate left to right, so this evaluates
+        # to int(ImgDim * ImgDim) up to float rounding; if a per-side value of
+        # int(ImgDim / PoolDim) * PoolDim was intended, the two can differ.
+        maximum = int(ImgDim / PoolDim * PoolDim * ImgDim / PoolDim * PoolDim)
+        input_count = 0
+
+        if not is1d:
+            for z in range(0, 2):
+                txns.append(counter)
+                counter += 1
+                cycles += 1
+                tracker += 1
+
+            if int(ImgDim / PoolDim) > 2:
+                txns.append(counter)
+                cycles += 1
+
+            for j in range(0, int(ImgDim / PoolDim)):
+                for k in range(0, int(PoolDim)):
+                    for z in range(0, int(ImgDim / PoolDim)):
+                        # actual read loop
+                        for x in range(0, PoolDim):
+                            if tracker < maximum:
+                                txns.append(counter)
+                                counter += 1
+                                cycles += 1
+                                tracker += 1
+
+                for k in range(0, int(PoolDim)):
+                    # read loop tail end
+                    for z in range(0, read_tail_latency):
+                        txns.append(counter)
+                        cycles += 1
+
+                # write delay
+                for z in range(0, int(ImgDim / PoolDim)):
+                    txns.append(counter)
+                    cycles += 1
+
+                # second, shorter tail: read_tail_latency - 2 held cycles
+                for z in range(0, read_tail_latency - 2):
+                    txns.append(counter)
+                    cycles += 1
+
+        else:
+            # 1d case
+            # per output: PoolDim steps, each reading int(NumChannels / PE) words
+            # while input_count < ImgDim, and each followed by one held cycle
+
+            for i in range(output_size):
+                for z in range(0, PoolDim):
+                    if input_count < ImgDim:
+                        for k in range(int(NumChannels / PE)):
+                            txns.append(counter)
+                            counter += 1
+                            cycles += 1
+                    input_count += 1
+                    txns.append(counter)
+                    cycles += 1
+
+                # int(NumChannels / PE) cycles with the counter held
+                for k in range(int(NumChannels / PE)):
+                    txns.append(counter)
+                    cycles += 1
+
+                # write tail: counter held for write_tail_latency cycles
+                for z in range(0, write_tail_latency):
+                    txns.append(counter)
+                    cycles += 1
+
+            for k in range(int(REMAINDER_PIXELS * NumChannels / PE)):
+                txns.append(counter)
+                counter += 1
+                cycles += 1
+
+        return txns, cycles, counter
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        One sample (the current ``counter``) is appended per modelled cycle;
+        ``counter`` advances only in the "write delay" loop (2D case) or in the
+        last per-output loop (1D case) and is held on every other cycle.
+        """
+        (
+            ifm_dim,
+            output_size,
+            is1d,
+            NumChannels,
+            PoolDim,
+            ImgDim,
+            PE,
+            windup_clocks,
+            read_delay,
+            bursts,
+            read_tail_latency,
+            write_tail_latency,
+        ) = kwargs
+
+        # one held cycle before anything else, in both the 2D and the 1D case
+        txns.append(counter)
+        cycles += 1
+        tracker = 0
+        # NOTE(review): "/" and "*" associate left to right, so this evaluates
+        # to int(ImgDim * ImgDim) up to float rounding; if a per-side value of
+        # int(ImgDim / PoolDim) * PoolDim was intended, the two can differ.
+        maximum = int(ImgDim / PoolDim * PoolDim * ImgDim / PoolDim * PoolDim)
+
+        if not is1d:
+            for z in range(0, 2):
+                txns.append(counter)
+                cycles += 1
+                tracker += 1
+
+            if int(ImgDim / PoolDim) > 2:
+                txns.append(counter)
+                cycles += 1
+
+            for j in range(0, int(ImgDim / PoolDim)):
+                for k in range(0, int(PoolDim)):
+                    for z in range(0, int(ImgDim / PoolDim)):
+                        # same loop nest as the input read loop; counter is held
+                        for x in range(0, PoolDim):
+                            if tracker < maximum:
+                                txns.append(counter)
+                                cycles += 1
+                                tracker += 1
+
+                for k in range(0, int(PoolDim)):
+                    # read loop tail end
+                    for z in range(0, read_tail_latency):
+                        txns.append(counter)
+                        cycles += 1
+
+                # write delay: the only 2D loop in which the counter advances
+                for z in range(0, int(ImgDim / PoolDim)):
+                    txns.append(counter)
+                    counter += 1
+                    cycles += 1
+
+                # second, shorter tail: read_tail_latency - 2 held cycles
+                for z in range(0, read_tail_latency - 2):
+                    txns.append(counter)
+                    cycles += 1
+
+        else:
+            # 1d case
+            # per output: PoolDim * int(NumChannels / PE) held cycles, then
+            # read_tail_latency held cycles, then int(NumChannels / PE) advances
+
+            for i in range(output_size):
+                for z in range(0, PoolDim):
+                    for k in range(int(NumChannels / PE)):
+                        txns.append(counter)
+                        cycles += 1
+
+                for z in range(0, read_tail_latency):
+                    txns.append(counter)
+                    cycles += 1
+
+                for k in range(int(NumChannels / PE)):
+                    txns.append(counter)
+                    counter += 1
+                    cycles += 1
+
+        return txns, cycles, counter
diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py
index 12cb76be4e..e9b0b17d73 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding.py
@@ -264,3 +264,40 @@ def calc_tmem(self):
         num_channels = self.get_nodeattr("NumChannels")
         pe = self.get_nodeattr("PE")
         return num_channels // pe
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather the iteration counts used by the analytical characteristic model.
+
+        Returns:
+            ``(TOTAL_ITERATIONS, NumChannels, PE, reps, ImgDim, NF)`` where
+            ``TOTAL_ITERATIONS = reps * ImgDim * NF`` and ``reps`` is fixed to 1.
+        """
+        n_channels, pe = self.get_nodeattr("NumChannels"), self.get_nodeattr("PE")
+        reps = 1
+        n_vectors = int(np.prod(list(self.get_nodeattr("numInputVectors"))))
+        n_folds = int(n_channels / pe)
+        return (reps * n_vectors * n_folds, n_channels, pe, reps, n_vectors, n_folds)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one input period: the counter advances on every modelled cycle."""
+        (total, _nc, _pe, _reps, _img, _nf) = kwargs
+        n_steps = max(0, total)
+
+        # samples are counter, counter + 1, ... -- one per iteration
+        txns.extend(range(counter, counter + n_steps))
+        return txns, cycles + n_steps, counter + n_steps
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        The counter is held for a fixed windup and then advances on each of the
+        ``TOTAL_ITERATIONS`` following cycles.
+        """
+        (total, _nc, _pe, _reps, _img, _nf) = kwargs
+        windup = 6
+        n_steps = max(0, total)
+
+        txns.extend([counter] * windup)
+        txns.extend(range(counter, counter + n_steps))
+
+        return txns, cycles + windup + n_steps, counter + n_steps
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index d95c6eb7cc..7f2a1bbfa5 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -789,21 +789,6 @@ def get_op_and_param_counts(self):
             ret_dict[thres_param_type] = thres_count
         return ret_dict
 
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["internal_decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
-
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         mem_mode = self.get_nodeattr("mem_mode")
@@ -922,3 +907,90 @@ def code_generation_ipi(self):
         else:
             raise Exception("Unrecognized mem_mode for VectorVectorActivation")
         return cmd
+
+    def prepare_kwargs_for_characteristic_fx(self):
+        """Gather the folding parameters used by the analytical characteristic model.
+
+        Returns:
+            ``(NF, SF, SIMD, TOTAL_FOLD, impl_style)`` with ``NF = int(Channels /
+            PE)``, ``SF = prod(Kernel)`` and ``TOTAL_FOLD = NF * SF * prod(Dim)``;
+            for the ``"rtl"`` style ``TOTAL_FOLD`` is further divided by ``SIMD``.
+        """
+        # NOTE(review): the implementation style is inferred from the node
+        # *name* containing "hls" -- confirm names always carry the op suffix
+        impl_style = "hls" if "hls" in self.onnx_node.name else "rtl"
+
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        n_channels = self.get_nodeattr("Channels")
+        synapse_fold = np.prod(self.get_nodeattr("Kernel"))
+        neuron_fold = int(n_channels / pe)
+        n_reps = np.prod(self.get_nodeattr("Dim"))
+
+        total_fold = neuron_fold * synapse_fold * n_reps
+        if impl_style == "rtl":
+            total_fold = int(total_fold / simd)
+
+        return (neuron_fold, synapse_fold, simd, total_fold, impl_style)
+
+    def characteristic_fx_input(self, txns, cycles, counter, kwargs):
+        """Append one period of the input characteristic function to ``txns``.
+
+        The counter advances on each of the ``TOTAL_FOLD`` modelled cycles.
+        """
+        (_nf, _sf, _simd, total_fold, _impl_style) = kwargs
+
+        # plain int so the returned counters keep their Python type
+        n_steps = max(0, int(total_fold))
+        txns.extend(range(counter, counter + n_steps))
+
+        return txns, cycles + n_steps, counter + n_steps
+
+    def characteristic_fx_output(self, txns, cycles, counter, kwargs):
+        """Append one period of the output characteristic function to ``txns``.
+
+        After a style-dependent windup with the counter held, ``TOTAL_FOLD + 1``
+        cycles follow; the counter advances by one each time ``SF`` of those
+        cycles have elapsed since the last advance (or since the start).
+
+        Returns the updated ``(txns, cycles, counter)``.
+        """
+        (_nf, synapse_fold, _simd, total_fold, impl_style) = kwargs
+
+        # windup length (cycles) differs between the two implementation styles
+        windup = 5 if impl_style == "hls" else 7
+        txns.extend([counter] * windup)
+        cycles += windup
+
+        since_advance = 0
+        for _ in range(total_fold + 1):
+            if since_advance == synapse_fold:
+                # a full group of SF cycles has elapsed
+                counter += 1
+                since_advance = 0
+            since_advance += 1
+            txns.append(counter)
+            cycles += 1
+
+        return txns, cycles, counter
+
+    def derive_characteristic_fxns(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
+        """Build the stimulus dict for this node and defer to the base implementation.
+
+        NOTE(review): the ``override_dict`` argument is not read here; the dict
+        built below is always the one forwarded -- confirm this is intended.
+        """
+        # one zero-valued entry per index of range(prod(folded input shape[:-1]))
+        n_inps = np.prod(self.get_folded_input_shape()[:-1])
+        stimulus = {"inputs": {"in0": [0 for _ in range(n_inps)]}, "outputs": {"out": []}}
+
+        # weights get their own entry only for these two memory modes;
+        # its length is calc_wmem(), not scaled by numInputVectors
+        if self.get_nodeattr("mem_mode") in ["internal_decoupled", "external"]:
+            stimulus["inputs"]["weights"] = [0 for _ in range(1 * self.calc_wmem())]
+
+        super().derive_characteristic_fxns(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=stimulus
+        )
diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py
index 4d3ac7dc67..4c19db4d8b 100644
--- a/src/finn/transformation/fpgadataflow/derive_characteristic.py
+++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py
@@ -52,10 +52,15 @@ class DeriveCharacteristic(NodeLocalTransformation):
       NodeLocalTransformation for more details.
     """
 
-    def __init__(self, period, num_workers=None, manual_bypass=False):
+    def __init__(
+        self, model, period, strategy, fpga_part, clk_period, num_workers=None
+    ):
         super().__init__(num_workers=num_workers)
+        self.model = model
         self.period = period
-        self.manual_bypass = manual_bypass
+        self.strategy = strategy
+        self.fpga_part = fpga_part
+        self.clk_period = clk_period
 
     def applyNodeLocal(self, node):
         op_type = node.op_type
@@ -63,7 +68,15 @@ def applyNodeLocal(self, node):
             try:
                 # lookup op_type in registry of CustomOps
                 inst = registry.getCustomOp(node)
-                inst.derive_characteristic_fxns(period=self.period)
+
+                inst.derive_characteristic_fxns(
+                    model=self.model,
+                    period=self.period,
+                    strategy=self.strategy,
+                    fpga_part=self.fpga_part,
+                    clk_period=self.clk_period,
+                    op_type=op_type,
+                )
             except KeyError:
                 # exception if op_type is not supported
                 raise Exception("Custom op_type %s is currently not supported." % op_type)
@@ -71,47 +84,6 @@ def applyNodeLocal(self, node):
 
     def apply(self, model: ModelWrapper):
         (model, run_again) = super().apply(model)
-        if not self.manual_bypass:
-            return (model, run_again)
-        # apply manual fix for DuplicateStreams and AddStreams for
-        # simple residual reconvergent paths with bypass
-        addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls")
-        for addstrm_node in addstrm_nodes:
-            # we currently only support the case where one branch is
-            # a bypass
-            b0 = model.find_producer(addstrm_node.input[0])
-            b1 = model.find_producer(addstrm_node.input[1])
-            if (b0 is None) or (b1 is None):
-                warnings.warn("Found unsupported AddStreams, skipping")
-                return (model, run_again)
-            b0_is_bypass = b0.op_type == "DuplicateStreams_hls"
-            b1_is_bypass = b1.op_type == "DuplicateStreams_hls"
-            if (not b0_is_bypass) and (not b1_is_bypass):
-                warnings.warn("Found unsupported AddStreams, skipping")
-                return (model, run_again)
-            ds_node = b0 if b0_is_bypass else b1
-            comp_branch_last = b1 if b0_is_bypass else b0
-
-            ds_comp_bout = ds_node.output[0] if b0_is_bypass else ds_node.output[1]
-            comp_branch_first = model.find_consumer(ds_comp_bout)
-            if comp_branch_first is None or comp_branch_last is None:
-                warnings.warn("Found unsupported DuplicateStreams, skipping")
-                return (model, run_again)
-            comp_branch_last = registry.getCustomOp(comp_branch_last)
-            comp_branch_first = registry.getCustomOp(comp_branch_first)
-            # for DuplicateStreams, use comp_branch_first's input characterization
-            # for AddStreams, use comp_branch_last's output characterization
-            period = comp_branch_first.get_nodeattr("io_chrc_period")
-            comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[: 2 * period]
-            comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[2 * period :]
-            ds_node_inst = registry.getCustomOp(ds_node)
-            addstrm_node_inst = registry.getCustomOp(addstrm_node)
-            ds_node_inst.set_nodeattr("io_chrc_period", period)
-            ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2)
-            addstrm_node_inst.set_nodeattr("io_chrc_period", period)
-            addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2)
-            warnings.warn(f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}")
-            warnings.warn(f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}")
         return (model, run_again)
 
 
diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
index d4cc6dcc99..523cb020e4 100644
--- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py
+++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py
@@ -46,6 +46,7 @@ def _codegen_single_node(node, model):
     try:
         # lookup op_type in registry of CustomOps
         inst = registry.getCustomOp(node)
+
         # get the path of the code generation directory
         code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
         # ensure that there is a directory
diff --git a/src/finn/util/test.py b/src/finn/util/test.py
index 2115e058a8..ea402d1c89 100644
--- a/src/finn/util/test.py
+++ b/src/finn/util/test.py
@@ -39,11 +39,28 @@
 from pkgutil import get_data
 from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
 
+from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
 from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
+from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic
 from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+    MinimizeAccumulatorWidth,
+)
+from finn.transformation.fpgadataflow.minimize_weight_bit_width import (
+    MinimizeWeightBitWidth,
+)
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy
-from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
+from finn.util.basic import (
+    alveo_default_platform,
+    alveo_part_map,
+    make_build_dir,
+    pynq_part_map,
+)
 
 # map of (wbits,abits) -> model
 example_map = {
@@ -184,3 +201,58 @@ def resize_smaller_side(target_pixels, img):
 def crop_center(size, img):
     """Crop central size*size window out of a PIL image."""
     return torchvision_util.center_crop(img, size)
+
+
+def compare_two_chr_funcs(a, b, relaxation):
+    """Return True iff a and b have equal shape and agree within +/- relaxation."""
+    # relaxation: leeway for the analytical implementation vs. RTL ground truth
+    for chr_a, chr_b in zip(a, b):
+        if len(chr_a) != len(chr_b):
+            return False  # a length mismatch can never count as equal
+        if any(x > y + relaxation or x < y - relaxation for x, y in zip(chr_a, chr_b)):
+            return False
+    return len(a) == len(b)
+
+
+def get_characteristic_fnc(model, node, part, target_clk_ns, strategy):
+    """Run DeriveCharacteristic on `model`; return the CustomOp of its first node.
+
+    `strategy` is forwarded to DeriveCharacteristic; PrepareIP only runs for
+    "rtlsim". `node` is used solely (via str()) to name the optional model
+    cache, which is disabled unless the local `caching` flag is flipped.
+    """
+    # If set to True: attempt to cache a pre-existing variant of the model
+    # this is to avoid generating RTL multiple times during
+    # test debugging
+    caching = False
+    model_cache = None
+
+    if strategy == "rtlsim" and caching:
+        build_dir = os.environ["FINN_BUILD_DIR"]
+        for x in os.listdir(build_dir):
+            if x.startswith(str(node)):
+                model_cache = f"{build_dir}/{x}/model.onnx"
+
+        if model_cache is not None:
+            model = ModelWrapper(model_cache)
+
+    if model_cache is None:
+        model = model.transform(SpecializeLayers(part))
+        model = model.transform(MinimizeWeightBitWidth())
+        model = model.transform(MinimizeAccumulatorWidth())
+        model = model.transform(GiveUniqueNodeNames())
+        if strategy == "rtlsim":
+            model = model.transform(PrepareIP(part, target_clk_ns))
+        model = model.transform(AnnotateCycles())
+
+        # characterization period: 3x the max_cycles estimate plus 10 cycles of slack
+        period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)
+
+        model = model.transform(
+            DeriveCharacteristic(model, period, strategy, part, target_clk_ns)
+        )
+        if caching:
+            tmp_caching_output_dir = make_build_dir(str(node))
+            model.save(tmp_caching_output_dir + "/model.onnx")
+
+    return getCustomOp(model.graph.node[0])
diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py
index 842d099f57..a7a73a5ed4 100644
--- a/tests/brevitas/test_brevitas_fc.py
+++ b/tests/brevitas/test_brevitas_fc.py
@@ -45,8 +45,6 @@
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = make_build_dir("test_brevitas_fc_")
-
 
 @pytest.mark.brevitas_export
 # act bits
@@ -61,6 +59,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits):
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     nname = "%s_%dW%dA" % (size, wbits, abits)
+    export_onnx_path = make_build_dir("test_brevitas_fc_")
     finn_onnx = export_onnx_path + "/%s.onnx" % nname
     fc = get_test_model_trained(size, wbits, abits)
     ishape = (1, 1, 28, 28)
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py
index 338204c0c7..31ebe96b33 100644
--- a/tests/fpgadataflow/test_fifosizing.py
+++ b/tests/fpgadataflow/test_fifosizing.py
@@ -30,17 +30,246 @@
 import pytest
 
 import json
+import numpy as np
+import os
 import shutil
 import torch
+import copy
 from brevitas.export import export_qonnx
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.im2col import compute_conv_output_dim
 from qonnx.custom_op.registry import getCustomOp
-
+from qonnx.transformation.general import (
+    GiveRandomTensorNames,
+    GiveReadableTensorNames,
+    GiveUniqueNodeNames,
+    GiveUniqueParameterTensors,
+)
+from qonnx.transformation.infer_data_layouts import InferDataLayouts
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.merge_onnx_models import MergeONNXModels
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
 import finn.builder.build_dataflow as build
 import finn.builder.build_dataflow_config as build_cfg
 from finn.util.basic import make_build_dir
 from finn.util.test import get_trained_network_and_ishape
 
+def generate_random_threshold_values(
+    data_type, num_input_channels, num_steps, narrow=False, per_tensor=False
+):
+    """Return random float32 thresholds drawn from the value range of data_type.
+
+    The result has shape (num_input_channels, num_steps); per_tensor collapses
+    the first axis to 1 and narrow drops one step.
+    """
+    rows = 1 if per_tensor else num_input_channels
+    cols = num_steps - 1 if narrow else num_steps
+    low, high = data_type.min(), data_type.max() + 1
+    # np.random.randint excludes the upper bound, hence max() + 1 above
+    return np.random.randint(low, high, (rows, cols)).astype(np.float32)
+
+
+def sort_thresholds_increasing(thresholds):
+    return np.sort(thresholds, axis=1)  # ascending within each row (axis 1)
+
+def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0):
+    # builds FMPadding_rtl -> ConvolutionInputGenerator_rtl -> MVAU_hls; hardcoded parameters:
+    idt = DataType["UINT4"]
+    wdt = DataType["UINT4"]
+    odt = DataType["UINT4"]
+    tdt = DataType["UINT32"]
+    stride = 1
+    in_ch = out_ch = ch  # input channel = output channel for stacking
+    # pad so that input dim = output dim for stacking (only supports odd kernel_size for now)
+    pad = int(np.floor(kernel_size / 2))
+
+    total_pad = 2 * pad
+    out_feature_dim = compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad)
+    weights_shape = [in_ch * kernel_size * kernel_size, out_ch]
+    thresholds_shape = [1, odt.get_num_possible_values() - 1]
+    input_shape = [1, ifm_dim, ifm_dim, in_ch]
+    padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch]
+    inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_size * kernel_size]
+    output_shape = [1, out_feature_dim, out_feature_dim, out_ch]
+
+    assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable"
+
+    padding_config = {}
+    padding_config["domain"] = "finn.custom_op.fpgadataflow.rtl"
+    padding_config["backend"] = "fpgadataflow"
+    padding_config["ImgDim"] = [ifm_dim, ifm_dim]
+    padding_config["NumChannels"] = in_ch
+    padding_config["SIMD"] = simd
+    padding_config["Padding"] = [pad, pad, pad, pad]
+    padding_config["inputDataType"] = idt.name
+
+    inpgen_config = {}
+    inpgen_config["domain"] = "finn.custom_op.fpgadataflow.rtl"
+    inpgen_config["backend"] = "fpgadataflow"
+    inpgen_config["ConvKernelDim"] = [kernel_size, kernel_size]
+    inpgen_config["IFMChannels"] = in_ch
+    inpgen_config["IFMDim"] = [ifm_dim + total_pad, ifm_dim + total_pad]
+    inpgen_config["OFMDim"] = [ifm_dim, ifm_dim]
+    inpgen_config["inputDataType"] = idt.name
+    inpgen_config["outputDataType"] = idt.name
+    inpgen_config["SIMD"] = simd
+    inpgen_config["parallel_window"] = parallel_window
+    inpgen_config["Stride"] = [stride, stride]
+    inpgen_config["Dilation"] = [1, 1]
+
+    mvau_config = {}
+    mvau_config["domain"] = "finn.custom_op.fpgadataflow.hls"
+    mvau_config["backend"] = "fpgadataflow"
+    mvau_config["numInputVectors"] = [1, ifm_dim, ifm_dim]
+    mvau_config["MW"] = in_ch * kernel_size * kernel_size
+    mvau_config["MH"] = in_ch  # same value as out_ch (in_ch = out_ch = ch above)
+    mvau_config["SIMD"] = simd if parallel_window == 0 else simd * kernel_size * kernel_size
+    mvau_config["PE"] = pe
+    mvau_config["resType"] = "lut"
+    mvau_config["mem_mode"] = "internal_embedded"  # internal_decoupled
+    mvau_config["inputDataType"] = idt.name
+    mvau_config["weightDataType"] = wdt.name
+    mvau_config["outputDataType"] = odt.name
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+    value_info = [
+        helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape),
+        helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape),
+        helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape),
+        helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape),
+    ]
+
+    modelproto = qonnx_make_model(
+        helper.make_graph(
+            name="building_block",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config),
+                helper.make_node(
+                    "ConvolutionInputGenerator_rtl",
+                    ["padding_out"],
+                    ["inpgen_out"],
+                    **inpgen_config,
+                ),
+                helper.make_node(
+                    "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config
+                ),
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", idt)
+    model.set_tensor_layout("top_in", ["N", "H", "W", "C"])
+    model.set_tensor_datatype("top_out", odt)
+    model.set_tensor_datatype("weights", wdt)
+    model.set_tensor_datatype("thresholds", tdt)
+
+    weights = gen_finn_dt_tensor(wdt, weights_shape)
+    # TODO: per_tensor=True below, so all channels share one set of thresholds
+    thresholds = generate_random_threshold_values(
+        tdt, out_ch, odt.get_num_possible_values() - 1, False, True
+    )
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    model.set_initializer("weights", weights)
+    model.set_initializer("thresholds", thresholds)
+
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    return model
+
+
+def combine_blocks(lb, rb, ifm_dim, ch, pe):
+    # assumes left (lb) and right (rb) branch each have one (dynamic) input/output of the same shape
+    # to avoid mix-ups, start by giving all tensors random names
+    lb = lb.transform(GiveRandomTensorNames())
+    rb = rb.transform(GiveRandomTensorNames())
+    # erase all node names to avoid conflict
+    for n in lb.graph.node:
+        n.name = ""
+    for n in rb.graph.node:
+        n.name = ""
+
+    lb_input = lb.graph.input[0]
+    lb_output = lb.graph.output[0]
+    rb_input = rb.graph.input[0]
+    rb_output = rb.graph.output[0]
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch])
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch])
+
+    dup_config = {}
+    dup_config["domain"] = "finn.custom_op.fpgadataflow.hls"
+    dup_config["backend"] = "fpgadataflow"
+    dup_config["numInputVectors"] = [1, ifm_dim, ifm_dim]
+    dup_config["NumChannels"] = ch
+    dup_config["PE"] = pe
+    dup_config["NumOutputStreams"] = 2  # one output stream per branch
+    dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name
+
+    add_config = {}
+    add_config["domain"] = "finn.custom_op.fpgadataflow.hls"
+    add_config["backend"] = "fpgadataflow"
+    add_config["numInputVectors"] = [1, ifm_dim, ifm_dim]
+    add_config["NumChannels"] = ch
+    add_config["PE"] = pe
+    add_config["inputDataType"] = lb.get_tensor_datatype(lb_output.name).name
+
+    nodes_lb = [node for node in lb.graph.node]
+    nodes_rb = [node for node in rb.graph.node]
+    nodes_new = (
+        nodes_lb
+        + nodes_rb
+        + [
+            helper.make_node(
+                "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config
+            ),
+            helper.make_node(
+                "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config
+            ),
+        ]
+    )
+
+    value_info_lb = [x for x in lb.graph.value_info]
+    value_info_rb = [x for x in rb.graph.value_info]
+    value_info_new = value_info_lb + value_info_rb + [lb_input, lb_output, rb_input, rb_output]
+
+    initializer_lb = [x for x in lb.graph.initializer]
+    initializer_rb = [x for x in rb.graph.initializer]
+    initializer_new = initializer_lb + initializer_rb
+    modelproto = qonnx_make_model(
+        helper.make_graph(
+            name="branching_model",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info_new,
+            nodes=nodes_new,
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name))
+    model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name))
+    for i in initializer_new:  # initializers were not passed to make_graph above
+        model.graph.initializer.append(i)
+
+    # tidy-up: re-infer shapes/datatypes/layouts, then assign unique, readable names
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveUniqueParameterTensors())
+    model = model.transform(GiveReadableTensorNames())
+    return model
 
 def fetch_test_model(topology, wbits=2, abits=2):
     tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology)
@@ -49,22 +278,33 @@ def fetch_test_model(topology, wbits=2, abits=2):
     export_qonnx(model, torch.randn(ishape), chkpt_name)
     return tmp_output_dir
 
-
 @pytest.mark.slow
 @pytest.mark.vivado
 @pytest.mark.fpgadataflow
 @pytest.mark.parametrize(
-    "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"]
+    "method",
+    [
+        "largefifo_rtlsim_python",
+        "largefifo_rtlsim_cpp",
+        "characterize_analytic",
+        "characterize_rtl",
+    ],
 )
 @pytest.mark.parametrize("topology", ["tfc", "cnv"])
 def test_fifosizing_linear(method, topology):
     force_python_rtlsim = "python" in method
     method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize"
     tmp_output_dir = fetch_test_model(topology)
+    # must be a FIFOCharacterizationMethod value, i.e. "rtlsim" or "analytical"
+    if method == "characterize_analytic":
+        characterization_strategy_key = "analytical"
+    else:
+        characterization_strategy_key = "rtlsim"
     cfg = build_cfg.DataflowBuildConfig(
         output_dir=tmp_output_dir,
         auto_fifo_depths=True,
         auto_fifo_strategy=method_key,
+        characteristic_function_strategy=characterization_strategy_key,
         target_fps=10000 if topology == "tfc" else 1000,
         force_python_rtlsim=force_python_rtlsim,
         synth_clk_period_ns=10.0,
@@ -104,10 +344,217 @@ def test_fifosizing_linear(method, topology):
         node0 = model0.graph.node[i]
         node1 = model1.graph.node[i]
         assert node0.op_type == node1.op_type
-        if node0.op_type == "StreamingFIFO":
+        if node0.op_type == "StreamingFIFO_rtl":
             node0_inst = getCustomOp(node0)
             node1_inst = getCustomOp(node1)
             assert node0_inst.get_nodeattr("depth") == node1_inst.get_nodeattr("depth")
 
     shutil.rmtree(tmp_output_dir)
     shutil.rmtree(tmp_output_dir_cmp)
+
+
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize("conv_config", [
+    (32, # dim
+     5, # kernel_size
+     4, # ch
+     4, # simd
+     4, # pe
+     1 # parallel_window
+    ),
+    #(16, 4, 3, 4, 4, 1),
+    #(16, 4, 3, 4, 4, 1)
+    ])
+@pytest.mark.parametrize("lb_num_layers", [1])
+@pytest.mark.parametrize("rb_num_layers", [3])
+@pytest.mark.parametrize("strategy", ["analytical", "rtlsim"])
+def test_fifosizing_nonlinear(conv_config, lb_num_layers, rb_num_layers, strategy):
+    """Size the FIFOs of a two-branch conv graph; check deadlock, throughput and tightness."""
+    # tightness: halving any single deep FIFO has to cause a throughput drop or a deadlock
+    np.random.seed(0)
+    tmp_output_dir = make_build_dir(
+        "test_fifosizing_nonlinear_%s_%s" % (lb_num_layers, rb_num_layers)
+    )
+    log = {}
+
+    # TODO: generalize FIFO test so it can be used by other FIFO-related unit tests
+    # TODO: allow manual folding/fifo config as input
+
+    # TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness?
+    #       e.g. reducing > 1 FIFOs simultaneously does not cause a throughput drop while reducing a single FIFO does?
+
+    # conv parameters
+    dim, kernel_size, ch, simd, pe, parallel_window = conv_config
+    log["strategy"] = strategy
+    log["lb_num_layers"] = lb_num_layers
+    log["rb_num_layers"] = rb_num_layers
+    log["dim"] = dim
+    log["kernel_size"] = kernel_size
+    log["ch"] = ch
+    log["simd"] = simd
+    log["pe"] = pe
+    log["parallel_window"] = parallel_window
+
+    # test parameters
+    # TODO: make configurable
+    # TODO: how to determine rtlsim_n?
+    rtlsim_n = 10
+    throughput_factor_threshold = 0.9
+    fifo_reduction_skip_threshold = 32  # skip FIFO tightness test for shallow FIFOs at or below this depth
+    fifo_reduction_factor = 0.5  # controls tightness
+    fifo_reduction_throughput_drop_threshold = 0.01
+    log["rtlsim_n"] = rtlsim_n
+    log["throughput_factor_threshold"] = throughput_factor_threshold
+    log["fifo_reduction_skip_threshold"] = fifo_reduction_skip_threshold
+    log["fifo_reduction_factor"] = fifo_reduction_factor
+    log["fifo_reduction_throughput_drop_threshold"] = fifo_reduction_throughput_drop_threshold
+
+    lb = None
+    for i in range(lb_num_layers):
+        new_block = make_conv_building_block(
+            dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window
+        )
+        lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block))
+    lb.save(tmp_output_dir + "/lb.onnx")
+
+    rb = None
+    for i in range(rb_num_layers):
+        new_block = make_conv_building_block(
+            dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window
+        )
+        rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block))
+    rb.save(tmp_output_dir + "/rb.onnx")
+
+    model = combine_blocks(lb, rb, dim, ch, pe=4)
+    model.save(tmp_output_dir + "/model.onnx")
+
+    cfg = build_cfg.DataflowBuildConfig(
+        output_dir=tmp_output_dir,
+        verbose=True,  # TODO: remove this?
+        # only works with characterization-based FIFO-sizing
+        auto_fifo_depths=True,
+        auto_fifo_strategy="characterize",
+        characteristic_function_strategy=strategy,
+        split_large_fifos=False,
+        # manual folding
+        target_fps=None,
+        # general rtlsim settings
+        force_python_rtlsim=False,
+        rtlsim_batch_size=rtlsim_n,
+        synth_clk_period_ns=10.0,
+        board="Pynq-Z1",
+        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
+        generate_outputs=[
+            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
+            build_cfg.DataflowOutputType.STITCHED_IP,
+            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
+        ],
+    )
+
+    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)
+
+    # load performance reports
+    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
+        est_data = json.load(f)
+    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
+        sim_data = json.load(f)
+
+    # check for deadlock: rtlsim must have moved exactly the expected number of stream transactions
+    model_final = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx")
+    first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name))
+    last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name))
+    input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * rtlsim_n
+    output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * rtlsim_n
+    deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected
+    log["deadlock"] = bool(deadlock)  # plain bool so json.dump accepts it
+
+    # check rtlsim throughput
+    throughput = sim_data["throughput[images/s]"]
+    stable_throughput = sim_data["stable_throughput[images/s]"]
+    estimated_throughput = est_data["estimated_throughput_fps"]
+    throughput_factor = throughput / estimated_throughput
+
+    # TODO: Take throughput or stable_throughput?
+    throughput_pass = throughput_factor > throughput_factor_threshold
+
+    log["throughput_pass"] = throughput_pass
+    log["throughput"] = throughput
+    log["stable_throughput"] = stable_throughput
+    log["estimated_throughput"] = estimated_throughput
+
+    # log FIFO sizes for easier inspection
+    log["fifo_sizes"] = {}
+    for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"):
+        node_inst = getCustomOp(node)
+        log["fifo_sizes"][node.name] = node_inst.get_nodeattr("depth")
+
+    # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear
+    fifo_reduction_pass = []
+    log["fifo_reduction_results"] = {}
+    model_orig = ModelWrapper(tmp_output_dir + "/intermediate_models/step_hw_ipgen.onnx")
+    for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"):
+        model = copy.deepcopy(model_orig)
+        node = model.get_node_from_name(node_orig.name)
+        node_inst = getCustomOp(node)
+
+        # skip shallow FIFOs
+        # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado?
+        if node_inst.get_nodeattr("depth") <= fifo_reduction_skip_threshold:
+            log["fifo_reduction_results"][node.name] = "skip"
+            continue
+
+        # reduce depth of current FIFO and reset generated code
+        node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * fifo_reduction_factor))
+        node_inst.set_nodeattr("code_gen_dir_ipgen", "")
+        node_inst.set_nodeattr("ip_path", "")
+        node_inst.set_nodeattr("ipgen_path", "")
+
+        # save model variation
+        tmp_output_dir_var = tmp_output_dir + "/variations/" + node.name
+        os.makedirs(tmp_output_dir_var)
+        model.save(tmp_output_dir_var + "/model.onnx")
+
+        # build again, only re-run necessary steps to save time
+        cfg.output_dir = tmp_output_dir_var
+        cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"]
+        build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg)
+
+        # load performance report
+        with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f:
+            sim_data = json.load(f)
+
+        # check for deadlock
+        model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx")
+        first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name))
+        last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name))
+        input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * rtlsim_n
+        output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * rtlsim_n
+        var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected
+
+        # check rtlsim throughput
+        var_throughput = sim_data["throughput[images/s]"]
+        # TODO: take throughput or stable_throughput?
+        throughput_drop = (throughput - var_throughput) / throughput
+
+        if var_deadlock:
+            fifo_reduction_pass.append(True)
+            log["fifo_reduction_results"][node.name] = 1.0
+        elif throughput_drop > fifo_reduction_throughput_drop_threshold:
+            fifo_reduction_pass.append(True)
+            log["fifo_reduction_results"][node.name] = throughput_drop
+        else:
+            fifo_reduction_pass.append(False)
+            log["fifo_reduction_results"][node.name] = "fail (no drop)"
+
+    # log for debugging
+    with open(tmp_output_dir + "/debug.json", "w") as f:
+        json.dump(log, f, indent=4)
+
+    # shutil.rmtree(tmp_output_dir)
+
+    # pass/fail test
+    assert not deadlock, "Deadlock detected, FIFOs too small."
+    assert throughput_pass, "Throughput too low, FIFOs too small."
+    assert all(fifo_reduction_pass), "FIFO tightness test failed, FIFOs too large."
diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
index 2ad49ae58b..c796ff0d77 100644
--- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
+++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py
@@ -29,6 +29,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
@@ -47,6 +48,7 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 
 def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs):
@@ -172,3 +174,61 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
         assert exp_cycles != 0
+
+
+# which port's characteristic function to compare (io_chrc_in / io_chrc_out)
+@pytest.mark.parametrize("direction", ["input", "output"])
+# activation datatype (used as the output datatype below)
+@pytest.mark.parametrize("act", [DataType["INT8"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# param datatype
+@pytest.mark.parametrize("pdt", [DataType["INT4"]])
+# folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1, 2])
+# number of input features
+@pytest.mark.parametrize("ich", [16])
+# vecs
+@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]])
+# function
+@pytest.mark.parametrize("func", ["add", "mul"])
+# execution mode (NOTE(review): not used in the test body)
+@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_analytical_characterization_channelwise_ops(
+    direction, idt, act, pdt, nf, ich, func, vecs, exec_mode
+):
+    if nf == -1:
+        nf = ich
+    pe = ich // nf
+    assert ich % pe == 0
+
+    # generate param data
+    C = gen_finn_dt_tensor(pdt, (ich))
+
+    odt = act
+
+    # create model
+    model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs)
+    node_details = ("ChannelWiseOp", C, pe, idt, odt, pdt, func, "hls")
+    part = "xc7z020clg400-1"
+    target_clk_ns = 4
+    allowed_chr_offset_positions = 5  # tolerance handed to compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # separate copy for the rtlsim run
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index dc5dc0c02a..8945d6c941 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -29,6 +29,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
@@ -48,6 +49,7 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 
 def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw):
@@ -237,3 +239,126 @@ def test_fpgadataflow_slidingwindow(
             assert exp_cycles != 0
         else:
             assert model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl"
+
+
+# which port's characteristic function to compare (io_chrc_in / io_chrc_out)
+@pytest.mark.parametrize("direction", ["input", "output"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["UINT4"]])
+# kernel size
+@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [2, 4])
+# Stride
+@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [2, 1]])
+# Dilation
+@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [2, 1]])
+# execution mode (NOTE(review): never used in the test body, doubles the number of runs)
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+# input channel parallelism ("SIMD"); NOTE(review): only feeds the skip checks below
+@pytest.mark.parametrize("simd", [1, 2, 4])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
+# parallel_window enable (MMV_out = M*K); NOTE(review): only feeds the skip checks below
+@pytest.mark.parametrize("parallel_window", [0, 1])
+# in/out MMV ("M") (NOTE(review): never used in the test body)
+@pytest.mark.parametrize("m", [1])
+# Flip dimensions
+@pytest.mark.parametrize("flip", [False])
+# implementation style (NOTE(review): never used in the test body, doubles the number of runs)
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_analytical_characterization_slidingwindow(
+    direction,
+    idt,
+    k,
+    ifm_dim,
+    ifm_ch,
+    stride,
+    dilation,
+    exec_mode,
+    simd,
+    dw,
+    parallel_window,
+    m,
+    flip,
+    impl_style,
+):
+    if flip:
+        if (
+            ifm_dim[0] == ifm_dim[1]
+            and k[0] == k[1]
+            and stride[0] == stride[1]
+            and dilation[0] == dilation[1]
+        ):
+            pytest.skip("Dimension flip would have no effect")
+        k = k[::-1]
+        ifm_dim = ifm_dim[::-1]
+        stride = stride[::-1]
+        dilation = dilation[::-1]
+
+    k_h, k_w = k
+    ifm_dim_h, ifm_dim_w = ifm_dim
+    stride_h, stride_w = stride
+    dilation_h, dilation_w = dilation
+
+    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
+    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation
+
+    if simd > ifm_ch:
+        pytest.skip("SIMD cannot be larger than number of input channels")
+    if ifm_ch % simd != 0:
+        pytest.skip("SIMD must divide number of input channels")
+    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
+        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
+    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
+        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
+    if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1):
+        pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim")
+    if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)):
+        pytest.skip("Not all combinations for stride > k edge case supported in default mode")
+    if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)):
+        pytest.skip("Parallel window requires SIMD=C for non-depthwise case")
+
+    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
+    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
+    ofm_dim = [ofm_dim_h, ofm_dim_w]
+
+    model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw)
+
+    model = model.transform(to_hw.InferConvInpGen())
+    node_details = (
+        "ConvolutionInputGenerator",
+        k,
+        ifm_ch,
+        ifm_dim,
+        ofm_dim,
+        stride,
+        dilation,
+        idt,
+        dw,
+        "hls",
+    )
+    part = "xc7z020clg400-1"
+    target_clk_ns = 4
+    allowed_chr_offset_positions = 5  # tolerance handed to compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # separate copy for the rtlsim run
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index 87e3267186..cb14ae8507 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -29,6 +29,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 import os
 from onnx import TensorProto, helper
@@ -49,6 +50,7 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.basic import pynq_part_map
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
@@ -162,3 +164,54 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style):
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
         assert exp_cycles != 0
+
+
+# Compare analytical vs. rtlsim characteristic functions; "direction" picks the port to check
+@pytest.mark.parametrize("direction", ["input", "output"])
+# input image dimension
+@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
+# number of rows and number of cols to add
+@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]])
+# number of channels
+@pytest.mark.parametrize("num_ch", [2, 4])
+# input parallelism; combinations with num_ch % simd != 0 are skipped
+@pytest.mark.parametrize("simd", [1, 2])
+# FINN input datatype
+@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
+# execution mode, applied to the model via SetExecMode
+@pytest.mark.parametrize("mode", ["rtlsim"])
+# implementation style, handed to make_single_fmpadding_modelwrapper
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_analytical_characterization_fmpadding(
+    direction, idim, pad, num_ch, simd, idt, mode, impl_style
+):
+    if num_ch % simd != 0:
+        pytest.skip(" num_ch % simd != 0, skipping")
+
+    model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt)
+    model = model.transform(InferShapes())
+    model = model.transform(SetExecMode(mode))
+
+    node_details = ("FMPadding", idim, pad, num_ch, simd, idt, mode, impl_style)
+    part = "xc7z020clg400-1"  # FPGA part string passed to get_characteristic_fnc
+    target_clk_ns = 4  # target clock passed to get_characteristic_fnc (ns, per the name)
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":  # compare the "io_chrc_in" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":  # compare the "io_chrc_out" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
index 83ab2ddcaf..241ccdde28 100644
--- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py
+++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py
@@ -29,6 +29,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
@@ -44,7 +45,11 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
-from finn.util.test import soft_verify_topk
+from finn.util.test import (
+    compare_two_chr_funcs,
+    get_characteristic_fnc,
+    soft_verify_topk,
+)
 
 
 def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style):
@@ -136,3 +141,53 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style):
     y = oxe.execute_onnx(model, input_dict)["outp"]
 
     assert soft_verify_topk(x, y, k), exec_mode + " failed"
+
+
+# Compare analytical vs. rtlsim characteristic functions; "direction" picks the port to check
+@pytest.mark.parametrize("direction", ["input", "output"])
+@pytest.mark.parametrize("idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]])
+# labels
+@pytest.mark.parametrize("labels", [10, 100])
+# folding; -1 selects pe = 1, any other value gives pe = labels // fold
+@pytest.mark.parametrize("fold", [-1, 2, 10])
+# number of top labels to select
+@pytest.mark.parametrize("k", [1, 5])
+# impl style
+@pytest.mark.parametrize("impl_style", ["hls"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_analytical_characterization_labelselect(
+    direction, idt, labels, fold, k, impl_style
+):
+    np.random.seed(0)
+    if fold == -1:
+        pe = 1
+    else:
+        pe = labels // fold
+    assert labels % pe == 0
+
+    if k == -1:  # not reached with the k values parametrized above (1 and 5)
+        k = labels
+
+    model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style)
+    node_details = ("LabelSelect", idt, labels, fold, k, impl_style)
+    part = "xc7z020clg400-1"  # FPGA part string passed to get_characteristic_fnc
+    target_clk_ns = 4  # target clock passed to get_characteristic_fnc (ns, per the name)
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":  # compare the "io_chrc_in" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":  # compare the "io_chrc_out" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py
index 1ec77f4eec..a497e5fc2a 100644
--- a/tests/fpgadataflow/test_fpgadataflow_mvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py
@@ -28,6 +28,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 import qonnx.custom_op.general.xnorpopcount as xp
 from onnx import TensorProto, helper
@@ -67,6 +68,7 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 
 def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None):
@@ -730,3 +732,80 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns):
     assert (
         output_matmul == output_mvau_rtl_stitch
     ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+
+
+# Compare analytical vs. rtlsim characteristic functions; "direction" picks the port to check
+@pytest.mark.parametrize("direction", ["input", "output"])
+# mem_mode: internal_embedded or internal_decoupled
+@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"])
+# activation: None or DataType (NOTE(review): only used in the RTL skip and in node_details)
+@pytest.mark.parametrize("act", [None, DataType["INT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["INT4"]])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [8])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [8])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [32])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [32])
+# Backend
+@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_analytical_characterization_mvau(
+    direction, mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style
+):
+    if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None):
+        pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations")
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+
+    # no thresholds are built (T stays None for every `act`), produce accumulators
+    T = None
+    tdt = None
+    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+        odt = DataType["UINT32"]
+    else:
+        odt = DataType["INT32"]
+
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+        inst.set_nodeattr("resType", "auto")
+        inst.set_nodeattr("preferred_impl_style", preferred_impl_style)
+
+    node_details = ("MVAU", mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style)
+    part = "xc7z020clg400-1"  # FPGA part string passed to get_characteristic_fnc
+    target_clk_ns = 4  # target clock passed to get_characteristic_fnc (ns, per the name)
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":  # compare the "io_chrc_in" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":  # compare the "io_chrc_out" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
index c520fb50fc..50d4ada783 100644
--- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
@@ -29,6 +29,7 @@
 
 import pytest
 
+import copy
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
@@ -48,6 +49,7 @@
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 
 def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode):
@@ -180,3 +182,76 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil
         # nested for-loops
         # assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
         assert exp_cycles != 0
+
+
+# Compare analytical vs. rtlsim characteristic functions; "direction" picks the port to check
+@pytest.mark.parametrize("direction", ["input", "output"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
+# 1d maxpool
+@pytest.mark.parametrize("dim_1d", [False, True])
+# kernel size
+@pytest.mark.parametrize("k", [2, 4])
+# input dimension
+@pytest.mark.parametrize("ifm_dim", [4, 10])
+# input channels
+@pytest.mark.parametrize("ifm_ch", [1, 3])
+# pe (NOTE(review): only used by the skip checks; never set on the node in this test)
+@pytest.mark.parametrize("pe", [1, 3])
+# ceil mode
+@pytest.mark.parametrize("ceil_mode", [1])
+# execution mode (not referenced in the test body)
+@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_analytical_characterization_streamingmaxpool(
+    direction, idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode
+):
+    ifm_dim_h = ifm_dim
+    k_h = k
+    if dim_1d:  # 1d case: width and kernel width collapse to 1
+        ifm_dim_w = 1
+        k_w = 1
+    else:  # 2d case: square image and square kernel
+        ifm_dim_w = ifm_dim_h
+        k_w = k_h
+    ifm_dim = (ifm_dim_h, ifm_dim_w)
+    k = (k_h, k_w)
+
+    stride_h = k_h  # stride equals the kernel size in both dimensions
+    stride_w = k_w
+    ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, stride_h, 0, ceil_mode)
+    ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, stride_w, 0, ceil_mode)
+    ofm_dim = (ofm_dim_h, ofm_dim_w)
+    if idt == DataType["BIPOLAR"] and dim_1d:
+        pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
+    if (ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0) and (not dim_1d):
+        pytest.skip("StreamingMaxPool_2d test w/ ImgDim % PoolDim != 0 not implemented")
+    if pe > ifm_ch:
+        pytest.skip("PE cannot be larger than number of input channels")
+    if pe > 1 and (not dim_1d):
+        pytest.skip("PE>1 only supported for StreamingMaxPool_1d")
+
+    model = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode)
+    model = model.transform(InferStreamingMaxPool())
+    node_details = ("StreamingMaxPool", k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode, "hls")
+    part = "xc7z020clg400-1"  # FPGA part string passed to get_characteristic_fnc
+    target_clk_ns = 4  # target clock passed to get_characteristic_fnc (ns, per the name)
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":  # compare the "io_chrc_in" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":  # compare the "io_chrc_out" attribute of both results
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index 2079fe7fc5..acc726f039 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -28,6 +28,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
@@ -50,6 +51,7 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
@@ -266,3 +268,155 @@ def test_fpgadataflow_thresholding(
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
         assert exp_cycles != 0
+
+
+# which port's characteristic function to compare
+@pytest.mark.parametrize("direction", ["input", "output"])
+@pytest.mark.parametrize("num_input_channels", [6, 16])
+@pytest.mark.parametrize(
+    "num_input_vecs",
+    [
+        [1],
+        [1, 2, 2],
+    ],
+)
+@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]])
+@pytest.mark.parametrize(
+    "idt_tdt_cfg",
+    [
+        (DataType["INT8"], DataType["INT8"]),
+        (DataType["INT8"], DataType["INT9"]),
+        (DataType["UINT5"], DataType["UINT5"]),
+        (DataType["UINT5"], DataType["UINT6"]),
+    ],
+)
+@pytest.mark.parametrize("fold", [-1, 1, 2])
+@pytest.mark.parametrize("narrow", [True, False])
+@pytest.mark.parametrize("per_tensor", [True, False])
+@pytest.mark.parametrize("impl_style", ["hls", "rtl"])
+@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_analytical_characterization_thresholding(
+    direction,
+    num_input_channels,
+    num_input_vecs,
+    activation,
+    idt_tdt_cfg,
+    fold,
+    narrow,
+    per_tensor,
+    impl_style,
+    exec_mode,
+    mem_mode,
+):
+    """Compare the analytical and rtlsim characteristic functions of a Thresholding node.
+
+    A MultiThreshold model is built from random, sorted thresholds, converted with
+    InferThresholdingLayer and specialized to ``impl_style``. get_characteristic_fnc is
+    then run once per strategy ("analytical" and "rtlsim", the latter on a deep copy)
+    and the "io_chrc_in" or "io_chrc_out" node attribute selected by ``direction`` is
+    checked with compare_two_chr_funcs using a tolerance argument of 5.
+
+    Invalid or redundant parameter combinations are skipped via pytest.skip.
+    """
+    # the mem_mode parameter can only be used for the hls thresholding
+    # so the test will only be executed once for impl_style=rtl and once skipped
+    # when the mem_mode is varied. Otherwise, the same test configuration would always
+    # run twice.
+    if impl_style == "rtl" and mem_mode == "internal_decoupled":
+        pytest.skip(
+            "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded"
+        )
+    if narrow and activation == DataType["BIPOLAR"]:
+        pytest.skip("Narrow needs to be false with bipolar activation.")
+    input_data_type, threshold_data_type = idt_tdt_cfg
+    num_steps = activation.get_num_possible_values() - 1
+
+    if fold == -1:
+        fold = num_input_channels
+    pe = num_input_channels // fold
+    if num_input_channels % pe != 0:
+        pytest.skip("Invalid folding configuration. Skipping test.")
+
+    output_data_type = activation
+    if activation == DataType["BIPOLAR"]:
+        activation_bias = 0
+    else:
+        activation_bias = activation.min()
+        if narrow and activation.signed():
+            activation_bias += 1
+
+    # Generate random thresholds and sort in ascending order
+    thresholds = generate_random_threshold_values(
+        threshold_data_type, num_input_channels, num_steps, narrow, per_tensor
+    )
+
+    # provide non-decreasing/ascending thresholds
+    thresholds = sort_thresholds_increasing(thresholds)
+
+    # Make a Multithreshold graph and convert to thresholding binary search node
+    model = make_single_multithresholding_modelwrapper(
+        thresholds,
+        input_data_type,
+        threshold_data_type,
+        output_data_type,
+        activation_bias,
+        num_input_vecs,
+        num_input_channels,
+    )
+
+    model = model.transform(InferThresholdingLayer())
+
+    # Transform to the specified implementation style, either the
+    # RTL or HLS according to test parameters
+    node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0]
+    inst = getCustomOp(node)
+    inst.set_nodeattr("preferred_impl_style", impl_style)
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    model = model.transform(InferShapes())
+    assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style)
+
+    node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0]
+    inst = getCustomOp(node)
+    inst.set_nodeattr("PE", pe)
+    if impl_style == "hls":
+        inst.set_nodeattr("mem_mode", mem_mode)
+    # set all attributes before transform(), which by default works on a deep copy of the model
+    model = model.transform(GiveUniqueNodeNames())
+
+    node_details = (
+        "Thresholding",
+        thresholds,
+        input_data_type,
+        threshold_data_type,
+        output_data_type,
+        activation_bias,
+        num_input_vecs,
+        num_input_channels,
+        impl_style,  # actual style under test ("hls" or "rtl")
+    )
+
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(
+        model, node_details, test_fpga_part, target_clk_ns, "analytical"
+    )
+    node_rtlsim = get_characteristic_fnc(
+        model_rtl, node_details, test_fpga_part, target_clk_ns, "rtlsim"
+    )
+    if direction == "input":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index 236176faa6..6383d5c609 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -28,6 +28,7 @@
 
 import pytest
 
+import copy
 import numpy as np
 from onnx import TensorProto, helper
 from qonnx.core.datatype import DataType
@@ -66,6 +67,7 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc
 
 
 def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
@@ -468,3 +470,117 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa
     assert (
         golden_out == output_vvau_stitched
     ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
+
+
+# which port's characteristic function to compare
+@pytest.mark.parametrize("direction", ["input", "output"])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
+# PE
+@pytest.mark.parametrize("pe", [1, 3, 6])
+# SIMD
+@pytest.mark.parametrize("simd", [1, 9])
+# Input image shape
+@pytest.mark.parametrize("dim_h", [10])
+@pytest.mark.parametrize("dim_w", [10, 1])
+# Kernel shape
+@pytest.mark.parametrize("k_h", [3])
+@pytest.mark.parametrize("k_w", [3, 1])
+# Number of input and output channels
+@pytest.mark.parametrize("channels", [3, 6])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
+# execution mode
+@pytest.mark.parametrize("exec_mode", ["rtlsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_analytical_characterization_vvau(
+    direction, idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
+):
+    """Compare the analytical and rtlsim characteristic functions of a VVAU node.
+
+    get_characteristic_fnc is run once per strategy (the rtlsim run on a deep copy of
+    the model); the "io_chrc_in" or "io_chrc_out" node attribute selected by
+    ``direction`` is then checked with compare_two_chr_funcs.
+    """
+    if dim_w == 1 and k_w != 1:
+        pytest.skip("1D image requires 1D kernel, skipping.")
+
+    if channels % pe != 0:
+        pytest.skip("Requirement Channels divisible by PE is violated.")
+
+    if (k_h * k_w) % simd != 0:
+        pytest.skip("Requirement kernel (k_h * k_w) divisible by SIMD is violated.")
+
+    # Generate weights in expected shape for ONNX and HLS node
+    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
+
+    if act is None:
+        T = None
+        tdt = None
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
+    else:
+        odt = act
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
+        T = np.sort(T, axis=1)
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            tdt = DataType["UINT32"]
+            # bias thresholds to be positive
+            T = np.ceil((T + (k_h * k_w)) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType["INT32"]
+
+    model = _make_single_vvau_modelwrapper(
+        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
+    )
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    node_details = (
+        "VVAU",
+        W,
+        pe,
+        simd,
+        k_h,
+        k_w,
+        channels,
+        dim_h,
+        dim_w,
+        wdt,
+        idt,
+        odt,
+        T,
+        tdt,
+        mem_mode,
+        "hls",
+    )
+    part = "xc7z020clg400-1"  # FPGA part string passed to get_characteristic_fnc
+    target_clk_ns = 4  # target clock passed to get_characteristic_fnc (ns, per the name)
+    allowed_chr_offset_positions = 5  # tolerance argument for compare_two_chr_funcs
+
+    model_rtl = copy.deepcopy(model)  # the rtlsim strategy runs on its own copy of the model
+    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
+    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
+    if direction == "input":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_in"),
+            node_rtlsim.get_nodeattr("io_chrc_in"),
+            allowed_chr_offset_positions,
+        )
+    elif direction == "output":
+        assert compare_two_chr_funcs(
+            node_analytical.get_nodeattr("io_chrc_out"),
+            node_rtlsim.get_nodeattr("io_chrc_out"),
+            allowed_chr_offset_positions,
+        )
diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py
index 8a91a49278..9e206c843a 100644
--- a/tests/transformation/streamline/test_streamline_cnv.py
+++ b/tests/transformation/streamline/test_streamline_cnv.py
@@ -50,8 +50,6 @@
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = make_build_dir("test_streamline_cnv_")
-
 
 @pytest.mark.streamline
 # act bits
@@ -64,6 +62,7 @@ def test_streamline_cnv(size, wbits, abits):
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     nname = "%s_%dW%dA" % (size, wbits, abits)
+    export_onnx_path = make_build_dir("test_streamline_cnv_")
     finn_onnx = export_onnx_path + "/%s.onnx" % nname
     fc = get_test_model_trained(size, wbits, abits)
     export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx)
diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py
index edc4a96fe2..9ce2f2ab65 100644
--- a/tests/transformation/streamline/test_streamline_fc.py
+++ b/tests/transformation/streamline/test_streamline_fc.py
@@ -52,8 +52,6 @@
 from finn.util.basic import make_build_dir
 from finn.util.test import get_test_model_trained
 
-export_onnx_path = make_build_dir("test_streamline_fc_")
-
 
 @pytest.mark.streamline
 # act bits
@@ -68,6 +66,7 @@ def test_streamline_fc(size, wbits, abits):
     if wbits > abits:
         pytest.skip("No wbits > abits cases at the moment")
     nname = "%s_%dW%dA" % (size, wbits, abits)
+    export_onnx_path = make_build_dir("test_streamline_fc_")
     finn_onnx = export_onnx_path + "/%s.onnx" % nname
     fc = get_test_model_trained(size, wbits, abits)
     export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx)