class FIFOCharacterizationMethod(str, Enum):
    """Strategy for obtaining the characteristic functions used in FIFO sizing.

    Every member is also a ``str`` and therefore compares equal to its plain
    string value (e.g. ``CHARACTERIZE_ANALYTICAL == "analytical"``).
    """

    # characteristic functions are deduced empirically by running RTL simulation per node
    CHARACTERIZE_RTLSIM = "rtlsim"
    # analytical characteristic functions are used where a node provides them
    CHARACTERIZE_ANALYTICAL = "analytical"
+ characteristic_function_strategy: Optional[ + FIFOCharacterizationMethod + ] = FIFOCharacterizationMethod.CHARACTERIZE_RTLSIM + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test #: if set to True, always using Python instead force_python_rtlsim: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index ab2280554c..220280031b 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -555,14 +555,18 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(InsertDWC()) model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateCycles()) + + period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10) model = model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) + DeriveCharacteristic( + model, + period, + cfg.characteristic_function_strategy, + cfg._resolve_fpga_part(), + cfg._resolve_hls_clk_period(), + ) ) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - model = model.transform(AnnotateCycles()) - period = model.analysis(dataflow_performance)["max_cycles"] + 10 - model = model.transform(DeriveCharacteristic(period)) model = model.transform(DeriveFIFOSizes()) model = model.transform( InsertFIFO( @@ -625,6 +629,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "depth_trigger_uram", "depth_trigger_bram", ] + extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config @@ -636,8 +641,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. 
FIFOs and DWCs) - model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) - model = model.transform(HLSSynthIP()) + # model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + # model = model.transform(HLSSynthIP()) return model diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py index ac61786ac1..4af2b64197 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams.py +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -159,7 +159,9 @@ def get_verilog_top_module_intf_names(self): intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] return intf_names - def derive_characteristic_fxns(self, period): + def derive_characteristic_fxns( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): n_inps = np.prod(self.get_folded_input_shape()[:-1]) io_dict = { "inputs": { @@ -168,4 +170,6 @@ def derive_characteristic_fxns(self, period): }, "outputs": {"out": []}, } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + super().derive_characteristic_fxns( + model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict + ) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py index 9bf4ebdf62..1f17ddc851 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -232,3 +232,41 @@ def execute_node(self, context, graph): sess = rt.InferenceSession(model_func.SerializeToString()) result = sess.run(None, idict) context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def prepare_kwargs_for_characteristic_fx(self): + # key parameters + PE = self.get_nodeattr("PE") + NumChannels = self.get_nodeattr("NumChannels") + NF = int(NumChannels / PE) + dim = np.prod(self.get_folded_output_shape()[1:-1]) + # assert True == False 
def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    For ``dim`` consecutive cycles the current transaction count is recorded
    and then incremented, i.e. one input word is consumed on every cycle.

    Args:
        txns: list of per-cycle cumulative transaction counts (extended in place).
        cycles: cycle count before this period.
        counter: cumulative transaction count before this period.
        kwargs: ``(NF, dim)`` tuple from ``prepare_kwargs_for_characteristic_fx``;
            only ``dim`` is used here.

    Returns:
        The tuple ``(txns, cycles, counter)`` after the period.
    """
    _nf, dim = kwargs
    # number of loop iterations; zero when dim is not positive
    n_words = len(range(dim))
    txns.extend(counter + step for step in range(n_words))
    return txns, cycles + n_words, counter + n_words
SIMD_COUNT * (ConvKernelDim_x - 1) - (ConvKernelDim_x - 1) + FINISH = IFMDim_x - ConvKernelDim_x - 2 + else: + BUFFER_SIZE = (ConvKernelDim_x - 1) * SIMD_COUNT + READ_CYCLES = 0 + FINISH = 0 + + OCNT_INITIAL = BUFFER_SIZE + (Stride_x - 1) + + DEFAULT_FIFO_DEPTH = 2 + + multiplying_factor = int(IFMChannels / SIMD) + number_blocks = int(ConvKernelDim_y / Stride_y + 1) + cycles_write_block = OFMDim_x * ConvKernelDim_x * ConvKernelDim_y * multiplying_factor + cycles_read_block = Stride_x * IFMDim_x * multiplying_factor + max_cycles = max(cycles_write_block, cycles_read_block) + baseIter = IFMDim_x * ConvKernelDim_y * multiplying_factor + OFMDim_y * max( + cycles_write_block, cycles_read_block + ) + initial_buffer = IFMDim_x * ConvKernelDim_y * multiplying_factor + + READ_DELAY = ( + number_blocks + * ConvKernelDim_x + * ConvKernelDim_y + * OFMDim_x + * OFMDim_y + * multiplying_factor + - ConvKernelDim_x * ConvKernelDim_y * OFMDim_x + ) + READ_ITES = int((baseIter - OFMDim_y) / max(cycles_write_block, cycles_read_block)) + + # assert True == False + kwargs = ( + SIMD_COUNT, + Stride_x, + Stride_y, + OUTPUT_SIZE, + INPUT_SIZE, + WINDOW_SIZE, + BUFFER_SIZE, + READ_CYCLES, + OCNT_INITIAL, + DEPTHWISE, + DEFAULT_FIFO_DEPTH, + is1d, + multiplying_factor, + number_blocks, + cycles_write_block, + cycles_read_block, + max_cycles, + baseIter, + initial_buffer, + FINISH, + OFMDim_y, + READ_DELAY, + READ_ITES, + ) + + # assert True==False + + return kwargs + + def characteristic_fx_input(self, txns, cycles, counter, kwargs): + # Compute one period of the input characteristic function + + ( + SIMD_COUNT, + Stride_x, + Stride_y, + OUTPUT_SIZE, + INPUT_SIZE, + WINDOW_SIZE, + BUFFER_SIZE, + READ_CYCLES, + OCNT_INITIAL, + DEPTHWISE, + DEFAULT_FIFO_DEPTH, + is1d, + multiplying_factor, + number_blocks, + cycles_write_block, + cycles_read_block, + max_cycles, + baseIter, + initial_buffer, + FINISH, + OFMDim_y, + READ_DELAY, + READ_ITES, + ) = kwargs + + if DEPTHWISE: + OCNT_MAX = 
def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    1D case: ``5 + READ_CYCLES`` cycles without output are followed by
    ``OUTPUT_SIZE`` cycles that each produce one output word.
    Otherwise: ``initial_buffer + 5 - 1`` cycles without output are followed by
    ``baseIter - initial_buffer`` cycles that each produce one output word.

    Args:
        txns: list of per-cycle cumulative transaction counts (extended in place).
        cycles: cycle count before this period.
        counter: cumulative transaction count before this period.
        kwargs: the 23-element tuple built by ``prepare_kwargs_for_characteristic_fx``.

    Returns:
        The tuple ``(txns, cycles, counter)`` after the period.
    """
    # the full tuple is unpacked so that a malformed kwargs still fails loudly;
    # underscore-prefixed entries are not needed for the output side
    (
        _simd_count,
        _stride_x,
        _stride_y,
        output_size,
        _input_size,
        _window_size,
        _buffer_size,
        read_cycles,
        _ocnt_initial,
        _depthwise,
        _default_fifo_depth,
        is_1d,
        _multiplying_factor,
        _number_blocks,
        _cycles_write_block,
        _cycles_read_block,
        _max_cycles,
        base_iter,
        initial_buffer,
        _finish,
        _ofm_dim_y,
        _read_delay,
        _read_ites,
    ) = kwargs

    # hyperparameter: latency of the initial loop
    startup_cycles = 5

    if is_1d:
        silent = len(range(startup_cycles)) + len(range(read_cycles))
        producing = len(range(output_size))
    else:
        silent = len(range(initial_buffer + startup_cycles - 1))
        producing = len(range(base_iter - initial_buffer))

    # cycles in which nothing leaves the node: the count stays flat
    txns.extend([counter] * silent)
    # cycles with one output each: record the count, then advance it
    txns.extend(counter + step for step in range(producing))
    return txns, cycles + silent + producing, counter + producing
def prepare_kwargs_for_characteristic_fx(self):
    """Collect the parameters needed by the analytical characteristic functions.

    Returns:
        ``(ImgDim, NewDim, Padding, NumChannels, SIMD, TOTAL_ELS, NF)`` where
        ``NewDim`` is the image size after padding (``Padding[0]``/``Padding[2]``
        are added to dimension 0, ``Padding[1]``/``Padding[3]`` to dimension 1),
        ``TOTAL_ELS`` is the product of ``NewDim`` and ``NF`` is
        ``int(NumChannels / SIMD)``.
    """
    attr = self.get_nodeattr
    img_dim = attr("ImgDim")
    padding = attr("Padding")
    # padded size per axis: original size plus the padding on both sides of that axis
    padded_dim = [img_dim[axis] + padding[axis] + padding[axis + 2] for axis in (0, 1)]
    num_channels = attr("NumChannels")
    simd = attr("SIMD")
    return (
        img_dim,
        padded_dim,
        padding,
        num_channels,
        simd,
        np.prod(padded_dim),
        int(num_channels / simd),
    )
TOTAL_ELS, NF) = kwargs + + for y in range(0, NewDim[0]): + for x in range(0, NewDim[1]): + for k in range(NF): + txns.append(counter) + if ( + Padding[0] <= y + and (y < (NewDim[0] - Padding[2])) + and Padding[1] <= x + and (x < (NewDim[1] - Padding[3])) + ): + counter += 1 + cycles += 1 + if NF == 1: # loop end delay when fully unrolled + txns.append(counter) + cycles += 1 + + return txns, cycles, counter + + def characteristic_fx_output(self, txns, cycles, counter, kwargs): + # Compute one period of the output characteristic function + + (ImgDim, NewDim, Padding, NumChannels, SIMD, TOTAL_ELS, NF) = kwargs + + for i in range(0, TOTAL_ELS): + for j in range(NF): + txns.append(counter) + counter += 1 + cycles += 1 + if NF == 1: # loop end delay when fully unrolled + txns.append(counter) + cycles += 1 + + return txns, cycles, counter diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index b753bc7a03..0a4ffc3fea 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -735,18 +735,3 @@ def ipgen_extra_directives(self): "Return a list of extra tcl directives for HLS synthesis." 
return ["config_compile -pipeline_style frp"] - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["internal_decoupled", "external"]: - n_weight_inps = self.calc_tmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index b40b8f3074..423535c859 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -34,7 +34,8 @@ from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple -from finn.util.basic import pyverilate_get_liveness_threshold_cycles +from finn.util.basic import make_build_dir, pyverilate_get_liveness_threshold_cycles +from finn.util.fpgadataflow import is_hls_node try: from pyverilator import PyVerilator @@ -94,8 +95,10 @@ def get_nodeattr_types(self): # the period for which the characterization was run "io_chrc_period": ("i", False, 0), # amount of zero padding inserted during chrc. 
- "io_chrc_pads_in": ("ints", False, []), - "io_chrc_pads_out": ("ints", False, []), + "io_chrc_pads_in": ("i", False, 0), + "io_chrc_pads_out": ("i", False, 0), + "io_chrc_in_concat": ("t", False, np.asarray([], dtype=np.int32)), + "io_chrc_out_concat": ("t", False, np.asarray([], dtype=np.int32)), } def get_verilog_top_module_name(self): @@ -361,10 +364,162 @@ def get_outstream_width_padded(self, ind=0): out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) - def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + def derive_characteristic_fxns( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): + if override_dict is None: + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + else: + io_dict = override_dict + + if strategy == "analytical": + # check for override function + prepare_kwargs_for_characteristic_fx = getattr( + self, "prepare_kwargs_for_characteristic_fx", None + ) + if callable(prepare_kwargs_for_characteristic_fx): + # Analytical flow + self.derive_characteristic_fxns_analytically(period, io_dict=io_dict) + return + + # RTL-based flow + self.derive_characteristic_fxns_rtlsim( + model, period, fpga_part, clk_period, io_dict=io_dict + ) + + def derive_characteristic_fxns_analytically(self, period, io_dict): + # Analytical flow + + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + + self.set_nodeattr("io_chrc_period", period) + + txn_in = [] + txn_out = [] + + # INPUT + + counter = 0 + padding = 0 + + kwargs = self.prepare_kwargs_for_characteristic_fx() + + # first period + cycles = 0 + 
txn_in, cycles, counter = self.characteristic_fx_input(txn_in, cycles, counter, kwargs) + + txn_in += [counter] * (period - cycles) + padding += period - cycles + + # second period + cycles = period + txn_in, cycles, counter = self.characteristic_fx_input(txn_in, cycles, counter, kwargs) + + txn_in += [counter] * (period * 2 - cycles) + padding += period * 2 - cycles + + # final assignments + all_txns_in[0, :] = np.array(txn_in[: period * 2]) + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_pads_in", padding) + + # OUTPUT + + counter = 0 + cycles = 0 + padding = 0 + + txn_out, cycles, counter = self.characteristic_fx_output(txn_out, cycles, counter, kwargs) + + txn_out += [counter] * (period - cycles) + padding += period - cycles + + cycles = period + + txn_out, cycles, counter = self.characteristic_fx_output(txn_out, cycles, counter, kwargs) + + txn_out += [counter] * (period * 2 - cycles) + padding += period * 2 - cycles + + all_txns_out[0, :] = np.array(txn_out[: period * 2]) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_out", padding) + + def derive_characteristic_fxns_rtlsim(self, model, period, fpga_part, clk_period, io_dict=None): """Return the unconstrained characteristic functions for this node.""" # ensure rtlsim is ready - assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name + if self.get_nodeattr("rtlsim_so") == "": + # generate the IP for this node + + # lazy construction of prepare_ip step + node = self.onnx_node + op_type = node.op_type + # get the path of the code generation directory + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # ensure that there is a directory + if code_gen_dir == "" or not os.path.isdir(code_gen_dir): + code_gen_dir = make_build_dir( + prefix="code_gen_ipgen_" + str(self.onnx_node.name) + "_" + ) + self.set_nodeattr("code_gen_dir_ipgen", code_gen_dir) + # ensure that there is generated code inside the dir + 
self.code_generation_ipgen(model, fpga_part, clk_period) + + # lazy construction of hlssynthip step + if is_hls_node(node): + # ensure that code is generated + try: + assert ( + self.get_nodeattr("code_gen_dir_ipgen") != "" + ), """Node + attribute "code_gen_dir_ipgen" is empty. Please run + transformation PrepareIP first.""" + if not os.path.isdir(self.get_nodeattr("ipgen_path")) or not self.get_nodeattr( + "code_gen_dir_ipgen" + ) in self.get_nodeattr("ipgen_path"): + # call the compilation function for this node + self.ipgen_singlenode_code() + else: + warnings.warn("Using pre-existing IP for %s" % self.onnx_node.name) + # ensure that executable path is now set + assert ( + self.get_nodeattr("ipgen_path") != "" + ), """Transformation + HLSSynthIP was not successful. Node attribute "ipgen_path" + is empty.""" + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." % op_type) + + # lazy construction of prepare rtlsim step + + try: + self.prepare_rtlsim() + # ensure that executable path is now set + assert ( + self.get_nodeattr("rtlsim_so") != "" + ), "Failed to prepare RTLSim, no rtlsim_so attribute found." + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." % op_type) + else: + self.prepare_rtlsim() + # ensure that executable path is now set + assert ( + self.get_nodeattr("rtlsim_so") != "" + ), "Failed to prepare RTLSim, no rtlsim_so attribute found." 
def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    One input word is consumed on each of ``int(Labels / PE) + 1`` consecutive
    cycles.

    Args:
        txns: list of per-cycle cumulative transaction counts (extended in place).
        cycles: cycle count before this period.
        counter: cumulative transaction count before this period.
        kwargs: ``(num_in_words, PE, K)`` from ``prepare_kwargs_for_characteristic_fx``;
            ``K`` is not needed on the input side.

    Returns:
        The tuple ``(txns, cycles, counter)`` after the period.
    """
    n_labels, pe, _k = kwargs
    n_reads = max(0, int(n_labels / pe) + 1)
    stop = counter + n_reads
    while counter < stop:
        txns.append(counter)
        counter += 1
    return txns, cycles + n_reads, counter
def prepare_kwargs_for_characteristic_fx(self):
    """Collect the parameters needed by the analytical characteristic functions.

    Returns:
        ``(MW, MH, SIMD, PE, BURST_COUNT, BURST_SIZE, numVectors)`` with
        ``BURST_SIZE = int(MW / SIMD)``, ``BURST_COUNT = int(MH / PE)`` and
        ``numVectors`` the product of the ``numInputVectors`` attribute.
    """
    mw, mh, simd, pe = (self.get_nodeattr(key) for key in ("MW", "MH", "SIMD", "PE"))
    n_vectors = np.prod(self.get_nodeattr("numInputVectors"))
    burst_size = int(mw / simd)
    burst_count = int(mh / pe)
    # note the order: BURST_COUNT comes before BURST_SIZE in the tuple
    return (mw, mh, simd, pe, burst_count, burst_size, n_vectors)
def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    One input word is consumed per cycle for ``KernelSize * NF`` cycles.

    Args:
        txns: list of per-cycle cumulative transaction counts (extended in place).
        cycles: cycle count before this period.
        counter: cumulative transaction count before this period.
        kwargs: ``(NF, KernelSize)`` from ``prepare_kwargs_for_characteristic_fx``.

    Returns:
        The tuple ``(txns, cycles, counter)`` after the period.
    """
    folds, kernel_elems = kwargs
    # total iterations of the kernel-element x fold loop nest (0 if either is not positive)
    n_words = len(range(0, kernel_elems)) * len(range(folds))
    txns.extend(counter + step for step in range(n_words))
    cycles += n_words
    counter += n_words
    return txns, cycles, counter
def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    After a wind-up phase, every repetition spends one wind-up cycle and then
    emits ``numOutWords`` output words. Each output word costs one cycle plus
    one stall cycle for every input word beyond the first that is gathered for
    it; when the widths are not integer multiples of each other and padding is
    required, an additional stall cycle is inserted whenever the gathered bits
    do not yet fill an output word.

    Args:
        txns: list of per-cycle cumulative transaction counts (extended in place).
        cycles: cycle count before this period.
        counter: cumulative transaction count before this period.
        kwargs: ``(numInWords, numOutWords, inWidth, outWidth, numReps)`` from
            ``prepare_kwargs_for_characteristic_fx``.

    Returns:
        The tuple ``(txns, cycles, counter)`` after the period.
    """
    (numInWords, numOutWords, inWidth, outWidth, numReps) = kwargs

    # wind-up latency (hyperparameters which may change), never longer than the
    # number of words in the stream it is limited by
    if inWidth == outWidth:
        windup_clocks = min(2, numOutWords)
    else:
        windup_clocks = min(3, numInWords)

    # padding adds latency only if the widths are not integer multiples of each
    # other and the input does not carry more bits than the output
    wider, narrower = max(inWidth, outWidth), min(inWidth, outWidth)
    pad = wider % narrower != 0 and not (numInWords * inWidth > numOutWords * outWidth)

    # whole input words gathered per output word (0 when down-converting)
    in_words_per_out = outWidth // inWidth

    # wind-up period: nothing is produced yet
    for _ in range(windup_clocks):
        txns.append(counter)
        cycles += 1

    # bits gathered but not yet emitted
    remainder = 0

    for _ in range(numReps):
        # per-repetition wind-up cycle
        txns.append(counter)
        cycles += 1

        for _ in range(numOutWords):
            for j in range(in_words_per_out):
                # the first input word of an output word costs no extra cycle
                if j != 0:
                    txns.append(counter)
                    cycles += 1
                remainder += inWidth

            if pad and remainder < outWidth:
                # one more input word is needed to fill this output word
                txns.append(counter)
                remainder += inWidth
                cycles += 1

            # the output word leaves the converter
            txns.append(counter)
            cycles += 1
            counter += 1
            remainder -= outWidth

    return txns, cycles, counter
self.get_1d_attrs_normalized() + ceil_mode = self.get_nodeattr("CeilMode") + output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + is1d = self.is_1d() + + NumChannels = self.get_nodeattr("NumChannels") + PoolDim = self.get_nodeattr("PoolDim")[0] + ImgDim = self.get_nodeattr("ImgDim")[0] + + # SIMD = self.get_nodeattr("SIMD") + PE = self.get_nodeattr("PE") + + windup_clocks = 4 + read_delay = 5 + + # for i in range(0,windup_clocks): + # txn_out[cycles] = i + # cycles+=1 + # p+=1 + + bursts = int(read_delay + ImgDim / PoolDim) + read_tail_latency = 6 + write_tail_latency = 14 + + kwargs = ( + ifm_dim, + output_size, + is1d, + NumChannels, + PoolDim, + ImgDim, + PE, + windup_clocks, + read_delay, + bursts, + read_tail_latency, + write_tail_latency, + ) + + return kwargs + + def characteristic_fx_input(self, txns, cycles, counter, kwargs): + ( + ifm_dim, + output_size, + is1d, + NumChannels, + PoolDim, + ImgDim, + PE, + windup_clocks, + read_delay, + bursts, + read_tail_latency, + write_tail_latency, + ) = kwargs + + if ImgDim > PoolDim * output_size: + REMAINDER_PIXELS = ImgDim - output_size * PoolDim + else: + REMAINDER_PIXELS = 0 + + tracker = 0 + maximum = int(ImgDim / PoolDim * PoolDim * ImgDim / PoolDim * PoolDim) + input_count = 0 + + if not is1d: + # if i == 0: + for z in range(0, 2): + txns.append(counter) + counter += 1 + cycles += 1 + tracker += 1 + + if int(ImgDim / PoolDim) > 2: + txns.append(counter) + cycles += 1 + + for j in range(0, int(ImgDim / PoolDim)): + for k in range(0, int(PoolDim)): + for z in range(0, int(ImgDim / PoolDim)): + # actual read loop + for x in range(0, PoolDim): + if tracker < maximum: + txns.append(counter) + counter += 1 + cycles += 1 + tracker += 1 + + for k in range(0, int(PoolDim)): + # read loop tail end + for z in range(0, read_tail_latency): + txns.append(counter) + cycles += 1 + + # write delay + for z in range(0, int(ImgDim / PoolDim)): + txns.append(counter) + cycles += 1 + + # for k in range(0, 
int(PoolDim)): + # read loop tail end + for z in range(0, read_tail_latency - 2): + txns.append(counter) + cycles += 1 + + else: + # 1d case + + # initial buffer space + # for k in range(int(NumChannels / PE)): + # txns.append(counter) + # cycles += 1 + + for i in range(output_size): + for z in range(0, PoolDim): + if input_count < ImgDim: + for k in range(int(NumChannels / PE)): + txns.append(counter) + counter += 1 + cycles += 1 + input_count += 1 + txns.append(counter) + cycles += 1 + + # read loop tail end + # for z in range(0, read_tail_latency): + # txns.append(counter) + # cycles += 1 + + for k in range(int(NumChannels / PE)): + txns.append(counter) + cycles += 1 + + # read loop tail end + for z in range(0, write_tail_latency): + txns.append(counter) + cycles += 1 + + for k in range(int(REMAINDER_PIXELS * NumChannels / PE)): + txns.append(counter) + counter += 1 + cycles += 1 + + return txns, cycles, counter + + def characteristic_fx_output(self, txns, cycles, counter, kwargs): + ( + ifm_dim, + output_size, + is1d, + NumChannels, + PoolDim, + ImgDim, + PE, + windup_clocks, + read_delay, + bursts, + read_tail_latency, + write_tail_latency, + ) = kwargs + + txns.append(counter) + cycles += 1 + tracker = 0 + maximum = int(ImgDim / PoolDim * PoolDim * ImgDim / PoolDim * PoolDim) + + if not is1d: + # if i == 0: + for z in range(0, 2): + txns.append(counter) + # counter += 1 + cycles += 1 + tracker += 1 + + if int(ImgDim / PoolDim) > 2: + txns.append(counter) + cycles += 1 + + for j in range(0, int(ImgDim / PoolDim)): + for k in range(0, int(PoolDim)): + for z in range(0, int(ImgDim / PoolDim)): + # actual read loop + for x in range(0, PoolDim): + if tracker < maximum: + txns.append(counter) + cycles += 1 + tracker += 1 + + for k in range(0, int(PoolDim)): + # read loop tail end + for z in range(0, read_tail_latency): + txns.append(counter) + cycles += 1 + + # write delay + for z in range(0, int(ImgDim / PoolDim)): + txns.append(counter) + counter += 1 + cycles += 
def prepare_kwargs_for_characteristic_fx(self):
    """Gather the parameters used by the analytical characteristic functions.

    Returns:
        tuple: ``(TOTAL_ITERATIONS, NumChannels, PE, reps, ImgDim, NF)`` where
        ``ImgDim`` is the product of the ``numInputVectors`` attribute,
        ``NF = NumChannels // PE`` and
        ``TOTAL_ITERATIONS = reps * ImgDim * NF`` with ``reps`` fixed to 1.
    """
    NumChannels = self.get_nodeattr("NumChannels")
    PE = self.get_nodeattr("PE")
    reps = 1
    ImgDim = int(np.prod(list(self.get_nodeattr("numInputVectors"))))
    # integer division, consistent with calc_tmem() in the same class
    NF = NumChannels // PE

    TOTAL_ITERATIONS = reps * ImgDim * NF

    return (TOTAL_ITERATIONS, NumChannels, PE, reps, ImgDim, NF)


def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    One input word is modelled as consumed on every cycle: for each of the
    ``TOTAL_ITERATIONS`` cycles the current ``counter`` is appended and then
    incremented.

    Returns:
        tuple: ``(txns, cycles, counter)`` after the period.
    """
    (TOTAL_ITERATIONS, NumChannels, PE, reps, ImgDim, NF) = kwargs

    for _ in range(TOTAL_ITERATIONS):
        txns.append(counter)
        counter += 1
        cycles += 1

    return txns, cycles, counter


def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    The model first holds ``counter`` for a fixed windup of 6 cycles, then
    produces one output word per cycle for ``TOTAL_ITERATIONS`` cycles.

    Returns:
        tuple: ``(txns, cycles, counter)`` after the period.
    """
    (TOTAL_ITERATIONS, NumChannels, PE, reps, ImgDim, NF) = kwargs

    windup = 6
    for _ in range(windup):
        txns.append(counter)
        cycles += 1

    # first input period
    for _ in range(TOTAL_ITERATIONS):
        txns.append(counter)
        counter += 1
        cycles += 1

    return txns, cycles, counter
def prepare_kwargs_for_characteristic_fx(self):
    """Gather the parameters used by the analytical characteristic functions.

    Returns:
        tuple: ``(NF, SF, SIMD, TOTAL_FOLD, impl_style)`` where
        ``NF = Channels // PE``, ``SF`` is the product of the ``Kernel``
        attribute and ``TOTAL_FOLD = NF * SF * prod(Dim)``, additionally
        divided by ``SIMD`` for the RTL implementation style.
    """
    # The implementation style is read off the node name.
    # NOTE(review): this relies on node names containing "hls"; confirm that
    # nodes are always named by GiveUniqueNodeNames before this is called.
    impl_style = "hls" if "hls" in self.onnx_node.name else "rtl"

    SIMD = self.get_nodeattr("SIMD")
    PE = self.get_nodeattr("PE")
    Channels = self.get_nodeattr("Channels")

    NF = Channels // PE
    SF = np.prod(self.get_nodeattr("Kernel"))
    numReps = np.prod(self.get_nodeattr("Dim"))
    TOTAL_FOLD = NF * SF * numReps

    if impl_style == "rtl":
        TOTAL_FOLD = int(TOTAL_FOLD // SIMD)

    return (NF, SF, SIMD, TOTAL_FOLD, impl_style)


def characteristic_fx_input(self, txns, cycles, counter, kwargs):
    """Append one period of the input characteristic function to ``txns``.

    One input word is modelled as consumed on each of ``TOTAL_FOLD`` cycles.

    Returns:
        tuple: ``(txns, cycles, counter)`` after the period.
    """
    (NF, SF, SIMD, TOTAL_FOLD, impl_style) = kwargs

    for _ in range(TOTAL_FOLD):
        txns.append(counter)
        counter += 1
        cycles += 1

    return txns, cycles, counter


def characteristic_fx_output(self, txns, cycles, counter, kwargs):
    """Append one period of the output characteristic function to ``txns``.

    After a fixed windup (5 cycles for HLS, 7 otherwise) the model runs for
    ``TOTAL_FOLD + 1`` cycles and increments ``counter`` each time ``SF``
    cycles have elapsed since the previous increment.

    Returns:
        tuple: ``(txns, cycles, counter)`` after the period.
    """
    (NF, SF, SIMD, TOTAL_FOLD, impl_style) = kwargs

    windup = 5 if impl_style == "hls" else 7
    for _ in range(windup):
        txns.append(counter)
        cycles += 1

    sf = 0
    for _ in range(TOTAL_FOLD + 1):
        if sf == SF:
            # a full synapse fold has been accumulated: one output word
            counter += 1
            sf = 0
        sf += 1
        txns.append(counter)
        cycles += 1

    return txns, cycles, counter


def derive_characteristic_fxns(
    self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
):
    """Build the stimulus dictionary for this node and defer to the base class.

    The dictionary holds one zero per folded input word for ``in0`` and, for
    the ``internal_decoupled`` / ``external`` memory modes, ``calc_wmem()``
    zeros for the ``weights`` stream. The ``override_dict`` argument received
    by this method is not forwarded; the locally built dictionary is always
    passed to the base implementation.
    """
    n_inps = int(np.prod(self.get_folded_input_shape()[:-1]))
    io_dict = {
        "inputs": {
            "in0": [0] * n_inps,
        },
        "outputs": {"out": []},
    }

    mem_mode = self.get_nodeattr("mem_mode")
    if mem_mode in ["internal_decoupled", "external"]:
        n_weight_inps = int(self.calc_wmem())
        io_dict["inputs"]["weights"] = [0] * n_weight_inps

    super().derive_characteristic_fxns(
        model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
    )
""" - def __init__(self, period, num_workers=None, manual_bypass=False): + def __init__( + self, model, period, strategy, fpga_part, clk_period, num_workers=None + ): super().__init__(num_workers=num_workers) + self.model = model self.period = period - self.manual_bypass = manual_bypass + self.strategy = strategy + self.fpga_part = fpga_part + self.clk_period = clk_period def applyNodeLocal(self, node): op_type = node.op_type @@ -63,7 +68,15 @@ def applyNodeLocal(self, node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) - inst.derive_characteristic_fxns(period=self.period) + + inst.derive_characteristic_fxns( + model=self.model, + period=self.period, + strategy=self.strategy, + fpga_part=self.fpga_part, + clk_period=self.clk_period, + op_type=op_type, + ) except KeyError: # exception if op_type is not supported raise Exception("Custom op_type %s is currently not supported." % op_type) @@ -71,47 +84,6 @@ def applyNodeLocal(self, node): def apply(self, model: ModelWrapper): (model, run_again) = super().apply(model) - if not self.manual_bypass: - return (model, run_again) - # apply manual fix for DuplicateStreams and AddStreams for - # simple residual reconvergent paths with bypass - addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls") - for addstrm_node in addstrm_nodes: - # we currently only support the case where one branch is - # a bypass - b0 = model.find_producer(addstrm_node.input[0]) - b1 = model.find_producer(addstrm_node.input[1]) - if (b0 is None) or (b1 is None): - warnings.warn("Found unsupported AddStreams, skipping") - return (model, run_again) - b0_is_bypass = b0.op_type == "DuplicateStreams_hls" - b1_is_bypass = b1.op_type == "DuplicateStreams_hls" - if (not b0_is_bypass) and (not b1_is_bypass): - warnings.warn("Found unsupported AddStreams, skipping") - return (model, run_again) - ds_node = b0 if b0_is_bypass else b1 - comp_branch_last = b1 if b0_is_bypass else b0 - - ds_comp_bout = ds_node.output[0] if 
b0_is_bypass else ds_node.output[1] - comp_branch_first = model.find_consumer(ds_comp_bout) - if comp_branch_first is None or comp_branch_last is None: - warnings.warn("Found unsupported DuplicateStreams, skipping") - return (model, run_again) - comp_branch_last = registry.getCustomOp(comp_branch_last) - comp_branch_first = registry.getCustomOp(comp_branch_first) - # for DuplicateStreams, use comp_branch_first's input characterization - # for AddStreams, use comp_branch_last's output characterization - period = comp_branch_first.get_nodeattr("io_chrc_period") - comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[: 2 * period] - comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[2 * period :] - ds_node_inst = registry.getCustomOp(ds_node) - addstrm_node_inst = registry.getCustomOp(addstrm_node) - ds_node_inst.set_nodeattr("io_chrc_period", period) - ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2) - addstrm_node_inst.set_nodeattr("io_chrc_period", period) - addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2) - warnings.warn(f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}") - warnings.warn(f"Set {addstrm_node.name} chrc. 
from {comp_branch_last.onnx_node.name}") return (model, run_again) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index d4cc6dcc99..523cb020e4 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -46,6 +46,7 @@ def _codegen_single_node(node, model): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) + # get the path of the code generation directory code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim") # ensure that there is a directory diff --git a/src/finn/util/test.py b/src/finn/util/test.py index 2115e058a8..ea402d1c89 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -39,11 +39,28 @@ from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles +from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy -from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +from finn.util.basic import ( + alveo_default_platform, + alveo_part_map, + make_build_dir, + pynq_part_map, +) # map of 
def compare_two_chr_funcs(a, b, relaxation):
    """Return True if every sample of ``a`` is within ``relaxation`` of ``b``.

    ``a`` and ``b`` are sequences of sequences; ``b`` is indexed with the
    indices of ``a``. ``relaxation`` determines how much leeway is allowed for
    the analytical implementation to be off from the RTL ground truth.
    The scan stops at the first sample that is out of tolerance.
    """
    for fn_idx, a_fn in enumerate(a):
        b_fn = b[fn_idx]
        for i, a_val in enumerate(a_fn):
            b_val = b_fn[i]
            if (a_val > (b_val + relaxation)) or (a_val < (b_val - relaxation)):
                return False
    return True


def get_characteristic_fnc(model, node, part, target_clk_ns, strategy):
    """Derive characteristic functions for ``model`` and return its first node.

    The model is specialized for ``part``, its weight and accumulator widths
    are minimized, nodes are named, IP is prepared when ``strategy`` is
    ``"rtlsim"``, cycles are annotated, and ``DeriveCharacteristic`` is run
    with a period of ``max_cycles * 3 + 10``.

    Returns:
        The custom-op wrapper of ``model.graph.node[0]``.
    """
    # If set to True: attempt to reuse a previously saved variant of the
    # model. This is to avoid generating RTL multiple times during test
    # debugging.
    caching = False
    model_cache = None

    if strategy == "rtlsim" and caching:
        build_dir = os.environ["FINN_BUILD_DIR"]
        for x in os.listdir(build_dir):
            if x.startswith(str(node)):
                model_cache = f"{build_dir}/{x}/model.onnx"

    # NOTE(review): the returned directory is not used here; presumably the
    # call is kept for its side effect of creating a build dir — confirm.
    make_build_dir("build_fifosizing")

    # NOTE(review): block nesting reconstructed from whitespace-flattened
    # source. With `caching` fixed to False the cached branch is never taken,
    # so the derivation branch below always runs.
    if model_cache is not None:
        model = ModelWrapper(model_cache)
    else:
        model = model.transform(SpecializeLayers(part))
        model = model.transform(MinimizeWeightBitWidth())
        model = model.transform(MinimizeAccumulatorWidth())
        model = model.transform(GiveUniqueNodeNames())
        if strategy == "rtlsim":
            model = model.transform(PrepareIP(part, target_clk_ns))
        model = model.transform(AnnotateCycles())

        period = int(model.analysis(dataflow_performance)["max_cycles"] * 3 + 10)

        model = model.transform(
            DeriveCharacteristic(
                model,
                period,
                strategy,
                part,
                target_clk_ns,
            )
        )
        if caching:
            tmp_caching_output_dir = make_build_dir(str(node))
            model.save(tmp_caching_output_dir + "/model.onnx")

    return getCustomOp(model.graph.node[0])
100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -45,8 +45,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_brevitas_fc_") - @pytest.mark.brevitas_export # act bits @@ -61,6 +59,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_brevitas_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 338204c0c7..31ebe96b33 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -30,17 +30,246 @@ import pytest import json +import numpy as np +import os import shutil import torch +import copy from brevitas.export import export_qonnx +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp - +from qonnx.transformation.general import ( + GiveRandomTensorNames, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir from finn.util.test import get_trained_network_and_ishape +def 
def generate_random_threshold_values(
    data_type, num_input_channels, num_steps, narrow=False, per_tensor=False
):
    """Draw a random ``float32`` threshold matrix from ``data_type``'s range.

    The result has shape ``(num_input_channels, num_steps)``; ``per_tensor``
    forces a single row and ``narrow`` drops one step.
    """
    if per_tensor:
        num_input_channels = 1
    if narrow:
        num_steps -= 1

    return np.random.randint(
        data_type.min(),
        data_type.max() + 1,
        (num_input_channels, num_steps),
    ).astype(np.float32)


def sort_thresholds_increasing(thresholds):
    """Return ``thresholds`` with every row sorted in increasing order."""
    return np.sort(thresholds, axis=1)


def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0):
    """Build a stackable FMPadding -> ConvolutionInputGenerator -> MVAU model.

    Input and output both have shape ``[1, ifm_dim, ifm_dim, ch]`` so several
    blocks can be chained. Only odd ``kernel_size`` values keep the shapes
    equal; other values trip the assertion below.
    """
    # hardcoded parameters
    idt = DataType["UINT4"]
    wdt = DataType["UINT4"]
    odt = DataType["UINT4"]
    tdt = DataType["UINT32"]
    stride = 1
    in_ch = out_ch = ch  # input channel = output channel for stacking
    # pad so that input dim = output dim for stacking
    pad = kernel_size // 2
    total_pad = 2 * pad
    kernel_elems = kernel_size * kernel_size
    num_thresholds = odt.get_num_possible_values() - 1

    out_feature_dim = compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad)
    weights_shape = [in_ch * kernel_elems, out_ch]
    thresholds_shape = [1, num_thresholds]
    input_shape = [1, ifm_dim, ifm_dim, in_ch]
    padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch]
    inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_elems]
    output_shape = [1, out_feature_dim, out_feature_dim, out_ch]

    assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable"

    padding_config = {
        "domain": "finn.custom_op.fpgadataflow.rtl",
        "backend": "fpgadataflow",
        "ImgDim": [ifm_dim, ifm_dim],
        "NumChannels": in_ch,
        "SIMD": simd,
        "Padding": [pad, pad, pad, pad],
        "inputDataType": idt.name,
    }

    inpgen_config = {
        "domain": "finn.custom_op.fpgadataflow.rtl",
        "backend": "fpgadataflow",
        "ConvKernelDim": [kernel_size, kernel_size],
        "IFMChannels": in_ch,
        "IFMDim": [ifm_dim + total_pad, ifm_dim + total_pad],
        "OFMDim": [ifm_dim, ifm_dim],
        "inputDataType": idt.name,
        "outputDataType": idt.name,
        "SIMD": simd,
        "parallel_window": parallel_window,
        "Stride": [stride, stride],
        "Dilation": [1, 1],
    }

    mvau_config = {
        "domain": "finn.custom_op.fpgadataflow.hls",
        "backend": "fpgadataflow",
        "numInputVectors": [1, ifm_dim, ifm_dim],
        "MW": in_ch * kernel_elems,
        "MH": in_ch,
        # with a parallel window the whole kernel window arrives per cycle
        "SIMD": simd if parallel_window == 0 else simd * kernel_elems,
        "PE": pe,
        "resType": "lut",
        "mem_mode": "internal_embedded",  # internal_decoupled
        "inputDataType": idt.name,
        "weightDataType": wdt.name,
        "outputDataType": odt.name,
    }

    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
    value_info = [
        helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape),
        helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape),
        helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape),
        helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape),
    ]

    nodes = [
        helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config),
        helper.make_node(
            "ConvolutionInputGenerator_rtl",
            ["padding_out"],
            ["inpgen_out"],
            **inpgen_config,
        ),
        helper.make_node(
            "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config
        ),
    ]

    modelproto = qonnx_make_model(
        helper.make_graph(
            name="building_block",
            inputs=[top_in],
            outputs=[top_out],
            value_info=value_info,
            nodes=nodes,
        )
    )

    model = ModelWrapper(modelproto)
    model.set_tensor_datatype("top_in", idt)
    model.set_tensor_layout("top_in", ["N", "H", "W", "C"])
    model.set_tensor_datatype("top_out", odt)
    model.set_tensor_datatype("weights", wdt)
    model.set_tensor_datatype("thresholds", tdt)

    weights = gen_finn_dt_tensor(wdt, weights_shape)
    # TODO: thresholds are all the same
    thresholds = generate_random_threshold_values(tdt, out_ch, num_thresholds, False, True)
    thresholds = sort_thresholds_increasing(thresholds)

    model.set_initializer("weights", weights)
    model.set_initializer("thresholds", thresholds)

    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())

    return model


def combine_blocks(lb, rb, ifm_dim, ch, pe):
    """Join two single-input/single-output models into a fork/join graph.

    A ``DuplicateStreams_hls`` node feeds ``top_in`` to both branches and an
    ``AddStreams_hls`` node sums their outputs into ``top_out``. Assumes the
    left branch (``lb``) and right branch (``rb``) each have a single dynamic
    input and output of shape ``[1, ifm_dim, ifm_dim, ch]``.
    """
    # to avoid mix-ups, start by giving all tensors random names
    lb = lb.transform(GiveRandomTensorNames())
    rb = rb.transform(GiveRandomTensorNames())
    # erase all node names to avoid conflict
    for branch in (lb, rb):
        for n in branch.graph.node:
            n.name = ""

    lb_input = lb.graph.input[0]
    lb_output = lb.graph.output[0]
    rb_input = rb.graph.input[0]
    rb_output = rb.graph.output[0]

    io_shape = [1, ifm_dim, ifm_dim, ch]
    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, io_shape)
    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, io_shape)

    dup_config = {
        "domain": "finn.custom_op.fpgadataflow.hls",
        "backend": "fpgadataflow",
        "numInputVectors": [1, ifm_dim, ifm_dim],
        "NumChannels": ch,
        "PE": pe,
        "NumOutputStreams": 2,
        "inputDataType": lb.get_tensor_datatype(lb_input.name).name,
    }

    add_config = {
        "domain": "finn.custom_op.fpgadataflow.hls",
        "backend": "fpgadataflow",
        "numInputVectors": [1, ifm_dim, ifm_dim],
        "NumChannels": ch,
        "PE": pe,
        "inputDataType": lb.get_tensor_datatype(lb_output.name).name,
    }

    fork_join_nodes = [
        helper.make_node(
            "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config
        ),
        helper.make_node(
            "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config
        ),
    ]
    nodes_new = list(lb.graph.node) + list(rb.graph.node) + fork_join_nodes

    # the former branch inputs/outputs become internal tensors
    value_info_new = (
        list(lb.graph.value_info)
        + list(rb.graph.value_info)
        + [lb_input, lb_output, rb_input, rb_output]
    )

    initializer_new = list(lb.graph.initializer) + list(rb.graph.initializer)

    modelproto = qonnx_make_model(
        helper.make_graph(
            name="branching_model",
            inputs=[top_in],
            outputs=[top_out],
            value_info=value_info_new,
            nodes=nodes_new,
        )
    )

    model = ModelWrapper(modelproto)
    model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name))
    model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name))
    model.graph.initializer.extend(initializer_new)

    # tidy-up
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())
    model = model.transform(InferDataLayouts())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveUniqueParameterTensors())
    model = model.transform(GiveReadableTensorNames())
    return model
def _rtlsim_deadlocked(model, sim_data, rtlsim_n):
    """Return True if the rtlsim transaction counts differ from the expected ones.

    The expected counts are the product of all but the last dimension of the
    folded input shape of the first node (resp. folded output shape of the last
    node) of ``model``, multiplied by ``rtlsim_n``. The result is converted to
    a plain ``bool`` so it can be written to JSON directly.
    """
    first_node = getCustomOp(model.find_consumer(model.graph.input[0].name))
    last_node = getCustomOp(model.find_producer(model.graph.output[0].name))
    input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * rtlsim_n
    output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * rtlsim_n
    return bool(
        sim_data["N_IN_TXNS"] != input_txns_expected
        or sim_data["N_OUT_TXNS"] != output_txns_expected
    )


@pytest.mark.slow
@pytest.mark.vivado
@pytest.mark.fpgadataflow
@pytest.mark.parametrize(
    "conv_config",
    [
        # (dim, kernel_size, ch, simd, pe, parallel_window)
        (32, 5, 4, 4, 4, 1),
    ],
)
@pytest.mark.parametrize("lb_num_layers", [1])
@pytest.mark.parametrize("rb_num_layers", [3])
@pytest.mark.parametrize("strategy", ["analytical", "rtlsim"])
def test_fifosizing_nonlinear(conv_config, lb_num_layers, rb_num_layers, strategy):
    """Check characterization-based FIFO sizing on a two-branch (fork/join) model.

    The test builds a branching model, runs the dataflow build with
    ``auto_fifo_strategy="characterize"`` and then checks three things:
    no deadlock in rtlsim, rtlsim throughput above a fraction of the estimate,
    and tightness of the FIFOs (shrinking any single non-shallow FIFO must
    cause a deadlock or a throughput drop). All intermediate results are
    written to ``debug.json`` in the build directory before asserting.
    """
    np.random.seed(0)
    tmp_output_dir = make_build_dir(
        "test_fifosizing_nonlinear_%s_%s" % (lb_num_layers, rb_num_layers)
    )
    log = {}

    # TODO: generalize FIFO test so it can be used by other FIFO-related unit tests
    # TODO: allow manual folding/fifo config as input
    # TODO: is a scenario possible where reducing depth of a single FIFO at a time
    # is not sufficient for testing tightness? e.g. reducing > 1 FIFOs simultaneously
    # does not cause a throughput drop while reducing a single FIFO does?

    # conv parameters
    dim, kernel_size, ch, simd, pe, parallel_window = conv_config
    log["strategy"] = strategy
    log["lb_num_layers"] = lb_num_layers
    log["rb_num_layers"] = rb_num_layers
    log["dim"] = dim
    log["kernel_size"] = kernel_size
    log["ch"] = ch
    log["simd"] = simd
    log["pe"] = pe
    log["parallel_window"] = parallel_window

    # test parameters
    # TODO: make configurable
    # TODO: how to determine rtlsim_n?
    rtlsim_n = 10
    throughput_factor_threshold = 0.9
    # skip FIFO tightness test for shallow FIFOs at or below this depth
    fifo_reduction_skip_threshold = 32
    fifo_reduction_factor = 0.5  # controls tightness
    fifo_reduction_throughput_drop_threshold = 0.01
    log["rtlsim_n"] = rtlsim_n
    log["throughput_factor_threshold"] = throughput_factor_threshold
    log["fifo_reduction_skip_threshold"] = fifo_reduction_skip_threshold
    log["fifo_reduction_factor"] = fifo_reduction_factor
    log["fifo_reduction_throughput_drop_threshold"] = fifo_reduction_throughput_drop_threshold

    # left branch: chain of lb_num_layers conv building blocks
    lb = None
    for _ in range(lb_num_layers):
        new_block = make_conv_building_block(
            dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window
        )
        lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block))
    lb.save(tmp_output_dir + "/lb.onnx")

    # right branch: chain of rb_num_layers conv building blocks
    rb = None
    for _ in range(rb_num_layers):
        new_block = make_conv_building_block(
            dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window
        )
        rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block))
    rb.save(tmp_output_dir + "/rb.onnx")

    # use the parametrized PE for the fork/join nodes instead of a fixed value
    model = combine_blocks(lb, rb, dim, ch, pe=pe)
    model.save(tmp_output_dir + "/model.onnx")

    cfg = build_cfg.DataflowBuildConfig(
        output_dir=tmp_output_dir,
        verbose=True,  # TODO: remove this?
        # only works with characterization-based FIFO-sizing
        auto_fifo_depths=True,
        auto_fifo_strategy="characterize",
        characteristic_function_strategy=strategy,
        split_large_fifos=False,
        # manual folding
        target_fps=None,
        # general rtlsim settings
        force_python_rtlsim=False,
        rtlsim_batch_size=rtlsim_n,
        synth_clk_period_ns=10.0,
        board="Pynq-Z1",
        shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
        generate_outputs=[
            build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
            build_cfg.DataflowOutputType.STITCHED_IP,
            build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,
        ],
    )

    build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg)

    # load performance reports
    with open(tmp_output_dir + "/report/estimate_network_performance.json") as f:
        est_data = json.load(f)
    with open(tmp_output_dir + "/report/rtlsim_performance.json") as f:
        sim_data = json.load(f)

    # check for deadlock
    model_final = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx")
    deadlock = _rtlsim_deadlocked(model_final, sim_data, rtlsim_n)
    log["deadlock"] = deadlock

    # check rtlsim throughput
    throughput = sim_data["throughput[images/s]"]
    stable_throughput = sim_data["stable_throughput[images/s]"]
    estimated_throughput = est_data["estimated_throughput_fps"]
    throughput_factor = throughput / estimated_throughput

    # TODO: Take throughput or stable_throughput?
    throughput_pass = throughput_factor > throughput_factor_threshold

    log["throughput_pass"] = throughput_pass
    log["throughput"] = throughput
    log["stable_throughput"] = stable_throughput
    log["estimated_throughput"] = estimated_throughput

    # log FIFO sizes for easier inspection
    log["fifo_sizes"] = {}
    for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"):
        log["fifo_sizes"][node.name] = getCustomOp(node).get_nodeattr("depth")

    # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear
    fifo_reduction_pass = []
    log["fifo_reduction_results"] = {}
    model_orig = ModelWrapper(tmp_output_dir + "/intermediate_models/step_hw_ipgen.onnx")
    for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"):
        # every variation starts from a fresh copy so only one FIFO is changed at a time
        model = copy.deepcopy(model_orig)
        node = model.get_node_from_name(node_orig.name)
        node_inst = getCustomOp(node)
        depth = node_inst.get_nodeattr("depth")

        # skip shallow FIFOs
        # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado?
        if depth <= fifo_reduction_skip_threshold:
            log["fifo_reduction_results"][node.name] = "skip"
            continue

        # reduce depth of current FIFO and reset generated code
        node_inst.set_nodeattr("depth", int(depth * fifo_reduction_factor))
        node_inst.set_nodeattr("code_gen_dir_ipgen", "")
        node_inst.set_nodeattr("ip_path", "")
        node_inst.set_nodeattr("ipgen_path", "")

        # save model variation
        tmp_output_dir_var = tmp_output_dir + "/variations/" + node.name
        os.makedirs(tmp_output_dir_var)
        model.save(tmp_output_dir_var + "/model.onnx")

        # build again, only re-run necessary steps to save time
        cfg.output_dir = tmp_output_dir_var
        cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"]
        build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg)

        # load performance report
        with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f:
            var_sim_data = json.load(f)

        # check for deadlock
        var_model_final = ModelWrapper(
            tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx"
        )
        var_deadlock = _rtlsim_deadlocked(var_model_final, var_sim_data, rtlsim_n)

        # check rtlsim throughput
        # TODO: take throughput or stable_throughput?
        var_throughput = var_sim_data["throughput[images/s]"]
        throughput_drop = (throughput - var_throughput) / throughput

        if var_deadlock:
            fifo_reduction_pass.append(True)
            log["fifo_reduction_results"][node.name] = 1.0
        elif throughput_drop > fifo_reduction_throughput_drop_threshold:
            fifo_reduction_pass.append(True)
            log["fifo_reduction_results"][node.name] = throughput_drop
        else:
            fifo_reduction_pass.append(False)
            log["fifo_reduction_results"][node.name] = "fail (no drop)"

    # log for debugging (written before the asserts so it exists on failure too)
    with open(tmp_output_dir + "/debug.json", "w") as f:
        json.dump(log, f, indent=4)

    # shutil.rmtree(tmp_output_dir)

    # pass/fail test
    assert not deadlock, "Deadlock detected, FIFOs too small."
    assert throughput_pass, "Throughput too low, FIFOs too small."
    assert all(fifo_reduction_pass), "FIFO tightness test failed, FIFOs too large."
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# activation: None or DataType
@pytest.mark.parametrize("act", [DataType["INT8"]])
# input datatype
@pytest.mark.parametrize("idt", [DataType["INT4"]])
# param datatype
@pytest.mark.parametrize("pdt", [DataType["INT4"]])
# folding, -1 is maximum possible
@pytest.mark.parametrize("nf", [-1, 2])
# number of input features
@pytest.mark.parametrize("ich", [16])
# vecs
@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]])
# function
@pytest.mark.parametrize("func", ["add", "mul"])
# execution mode
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_fpgadataflow_analytical_characterization_channelwise_ops(
    direction, idt, act, pdt, nf, ich, func, vecs, exec_mode
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a channelwise op.

    Both are obtained through ``get_characteristic_fnc``; the ``io_chrc_in`` or
    ``io_chrc_out`` attribute (selected by ``direction``) must match according
    to ``compare_two_chr_funcs``.
    """
    folding = ich if nf == -1 else nf
    pe = ich // folding
    assert ich % pe == 0

    # parameter tensor and output datatype
    C = gen_finn_dt_tensor(pdt, (ich))
    odt = act

    model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs)
    node_details = ("ChannelWiseOp", C, pe, idt, odt, pdt, func, "hls")
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")

    chrc_attr = {"input": "io_chrc_in", "output": "io_chrc_out"}.get(direction)
    if chrc_attr is not None:
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr(chrc_attr),
            node_rtlsim.get_nodeattr(chrc_attr),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# input datatype
@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["UINT4"]])
# kernel size
@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]])
# input dimension
@pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]])
# input channels
@pytest.mark.parametrize("ifm_ch", [2, 4])
# Stride
@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [2, 1]])
# Dilation
@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [2, 1]])
# execution mode
@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
# input channel parallelism ("SIMD")
@pytest.mark.parametrize("simd", [1, 2, 4])
# depthwise
@pytest.mark.parametrize("dw", [0, 1])
# parallel_window enable (MMV_out = M*K)
@pytest.mark.parametrize("parallel_window", [0, 1])
# in/out MMV ("M")
@pytest.mark.parametrize("m", [1])
# Flip dimensions
@pytest.mark.parametrize("flip", [False])
# implementation style
@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_analytical_characterization_slidingwindow(
    direction,
    idt,
    k,
    ifm_dim,
    ifm_ch,
    stride,
    dilation,
    exec_mode,
    simd,
    dw,
    parallel_window,
    m,
    flip,
    impl_style,
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a sliding-window node.

    Illegal or unsupported parameter combinations are skipped; for the rest the
    ``io_chrc_in`` or ``io_chrc_out`` attribute (selected by ``direction``) of
    both results must match according to ``compare_two_chr_funcs``.
    """
    if flip:
        symmetric = (
            ifm_dim[0] == ifm_dim[1]
            and k[0] == k[1]
            and stride[0] == stride[1]
            and dilation[0] == dilation[1]
        )
        if symmetric:
            pytest.skip("Dimension flip would have no effect")
        k, ifm_dim = k[::-1], ifm_dim[::-1]
        stride, dilation = stride[::-1], dilation[::-1]

    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    # effective kernel extent incl. dilation
    kernel_width = (k_w - 1) * dilation_w + 1
    kernel_height = (k_h - 1) * dilation_h + 1
    unit_kernel = k_h == 1 and k_w == 1

    if simd > ifm_ch:
        pytest.skip("SIMD cannot be larger than number of input channels")
    if ifm_ch % simd != 0:
        pytest.skip("SIMD must divide number of input channels")
    if (
        kernel_height > ifm_dim_h
        or stride_h > ifm_dim_h
        or kernel_width > ifm_dim_w
        or stride_w > ifm_dim_w
    ):
        pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension")
    if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1):
        pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim")
    if (stride_h > k_h or stride_w > k_w) and not (parallel_window or unit_kernel):
        pytest.skip("Not all combinations for stride > k edge case supported in default mode")
    if parallel_window and simd != ifm_ch and not (dw or unit_kernel):
        pytest.skip("Parallel window requires SIMD=C for non-depthwise case")

    ofm_dim = [
        compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h),
        compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w),
    ]

    model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw)
    model = model.transform(to_hw.InferConvInpGen())

    node_details = (
        "ConvolutionInputGenerator",
        k,
        ifm_ch,
        ifm_dim,
        ofm_dim,
        stride,
        dilation,
        idt,
        dw,
        "hls",
    )
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")

    chrc_attr = {"input": "io_chrc_in", "output": "io_chrc_out"}.get(direction)
    if chrc_attr is not None:
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr(chrc_attr),
            node_rtlsim.get_nodeattr(chrc_attr),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# input image dimension
@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
# number of rows and number of cols to add
@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]])
# number of channels
@pytest.mark.parametrize("num_ch", [2, 4])
# Input parallelism
@pytest.mark.parametrize("simd", [1, 2])
# FINN input datatype
@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["INT4"]])
# execution mode
@pytest.mark.parametrize("mode", ["rtlsim"])
# implementation style
@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_analytical_characterization_fmpadding(
    direction, idim, pad, num_ch, simd, idt, mode, impl_style
):
    """Compare the "analytical" and "rtlsim" characteristic functions of an FMPadding node.

    Configurations where ``simd`` does not divide ``num_ch`` are skipped. The
    ``io_chrc_in`` or ``io_chrc_out`` attribute (selected by ``direction``) of
    both results must match according to ``compare_two_chr_funcs``.
    """
    if num_ch % simd:
        pytest.skip(" num_ch % simd != 0, skipping")

    model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt)
    for prep in (InferShapes(), SetExecMode(mode)):
        model = model.transform(prep)

    node_details = ("FMPadding", idim, pad, num_ch, simd, idt, mode, impl_style)
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")

    chrc_attr = {"input": "io_chrc_in", "output": "io_chrc_out"}.get(direction)
    if chrc_attr is not None:
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr(chrc_attr),
            node_rtlsim.get_nodeattr(chrc_attr),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
@pytest.mark.parametrize("idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]])
# labels
@pytest.mark.parametrize("labels", [10, 100])
# folding
@pytest.mark.parametrize("fold", [-1, 2, 10])
# number of top labels to select
@pytest.mark.parametrize("k", [1, 5])
# impl style
@pytest.mark.parametrize("impl_style", ["hls"])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_fpgadataflow_analytical_characterization_labelselect(
    direction, idt, labels, fold, k, impl_style
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a LabelSelect node.

    The ``io_chrc_in`` or ``io_chrc_out`` attribute (selected by ``direction``)
    of both results must match according to ``compare_two_chr_funcs``.
    """
    np.random.seed(0)
    # fold == -1 maps to a single processing element
    pe = 1 if fold == -1 else labels // fold
    assert labels % pe == 0

    # k == -1 means "select all labels"
    k = labels if k == -1 else k

    model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style)
    node_details = ("LabelSelect", idt, labels, fold, k, impl_style)
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")

    chrc_attr = {"input": "io_chrc_in", "output": "io_chrc_out"}.get(direction)
    if chrc_attr is not None:
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr(chrc_attr),
            node_rtlsim.get_nodeattr(chrc_attr),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# mem_mode: internal_embedded or internal_decoupled
@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"])
# activation: None or DataType
@pytest.mark.parametrize("act", [None, DataType["INT4"]])
# weight datatype
@pytest.mark.parametrize("wdt", [DataType["INT4"]])
# input datatype
@pytest.mark.parametrize("idt", [DataType["INT4"]])
# neuron folding, -1 is maximum possible
@pytest.mark.parametrize("nf", [8])
# synapse folding, -1 is maximum possible
@pytest.mark.parametrize("sf", [8])
# HLS matrix width (input features)
@pytest.mark.parametrize("mw", [32])
# HLS matrix height (output features)
@pytest.mark.parametrize("mh", [32])
# Backend
@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_fpgadataflow_analytical_characterization_mvau(
    direction, mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style
):
    """Compare the "analytical" and "rtlsim" characteristic functions of an MVAU node.

    The layer is always built without thresholds (accumulator output); ``act``
    only takes part in the RTL skip condition and in ``node_details``. The
    ``io_chrc_in`` or ``io_chrc_out`` attribute (selected by ``direction``) of
    both results must match according to ``compare_two_chr_funcs``.
    """
    if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None):
        pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations")
    if nf == -1:
        nf = mh
    if sf == -1:
        sf = mw
    pe = mh // nf
    simd = mw // sf
    # folding sanity checks: PE must divide MH and SIMD must divide MW
    assert mh % pe == 0
    assert mw % simd == 0
    # generate weights
    W = gen_finn_dt_tensor(wdt, (mw, mh))

    # no activation, produce accumulators
    T = None
    tdt = None
    if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
        odt = DataType["UINT32"]
    else:
        odt = DataType["INT32"]

    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
    for node in model.graph.node:
        # lookup op_type in registry of CustomOps
        inst = getCustomOp(node)
        inst.set_nodeattr("mem_mode", mem_mode)
        inst.set_nodeattr("resType", "auto")
        inst.set_nodeattr("preferred_impl_style", preferred_impl_style)

    node_details = ("MVAU", mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style)
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
    if direction == "input":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_in"),
            node_rtlsim.get_nodeattr("io_chrc_in"),
            allowed_chr_offset_positions,
        )
    elif direction == "output":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_out"),
            node_rtlsim.get_nodeattr("io_chrc_out"),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# input datatype
@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]])
# 1d maxpool
@pytest.mark.parametrize("dim_1d", [False, True])
# kernel size
@pytest.mark.parametrize("k", [2, 4])
# input dimension
@pytest.mark.parametrize("ifm_dim", [4, 10])
# input channels
@pytest.mark.parametrize("ifm_ch", [1, 3])
# pe
@pytest.mark.parametrize("pe", [1, 3])
# ceil mode
@pytest.mark.parametrize("ceil_mode", [1])
# execution mode
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_analytical_characterization_streamingmaxpool(
    direction, idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a StreamingMaxPool node.

    Unsupported parameter combinations are skipped. The ``io_chrc_in`` or
    ``io_chrc_out`` attribute (selected by ``direction``) of both results must
    match according to ``compare_two_chr_funcs``.
    """
    # expand scalar parameters to (height, width); 1D pooling has unit width
    ifm_dim_h, k_h = ifm_dim, k
    ifm_dim_w, k_w = (1, 1) if dim_1d else (ifm_dim_h, k_h)
    ifm_dim = (ifm_dim_h, ifm_dim_w)
    k = (k_h, k_w)

    # stride equals the kernel size
    stride_h, stride_w = k_h, k_w
    ofm_dim = (
        compute_pool_output_dim(ifm_dim_h, k_h, stride_h, 0, ceil_mode),
        compute_pool_output_dim(ifm_dim_w, k_w, stride_w, 0, ceil_mode),
    )

    is_2d = not dim_1d
    if idt == DataType["BIPOLAR"] and dim_1d:
        pytest.skip("Skipping binary StreamingMaxPool_1d (not implemented)")
    if is_2d and (ifm_dim_h % k_h != 0 or ifm_dim_w % k_w != 0):
        pytest.skip("StreamingMaxPool_2d test w/ ImgDim % PoolDim != 0 not implemented")
    if pe > ifm_ch:
        pytest.skip("PE cannot be larger than number of input channels")
    if is_2d and pe > 1:
        pytest.skip("PE>1 only supported for StreamingMaxPool_1d")

    model = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode)
    model = model.transform(InferStreamingMaxPool())

    node_details = ("StreamingMaxPool", k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode, "hls")
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")

    chrc_attr = {"input": "io_chrc_in", "output": "io_chrc_out"}.get(direction)
    if chrc_attr is not None:
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr(chrc_attr),
            node_rtlsim.get_nodeattr(chrc_attr),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
@pytest.mark.parametrize("num_input_channels", [6, 16])
@pytest.mark.parametrize(
    "num_input_vecs",
    [
        [1],
        [1, 2, 2],
    ],
)
@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]])
@pytest.mark.parametrize(
    "idt_tdt_cfg",
    [
        (DataType["INT8"], DataType["INT8"]),
        (DataType["INT8"], DataType["INT9"]),
        (DataType["UINT5"], DataType["UINT5"]),
        (DataType["UINT5"], DataType["UINT6"]),
    ],
)
@pytest.mark.parametrize("fold", [-1, 1, 2])
@pytest.mark.parametrize("narrow", [True, False])
@pytest.mark.parametrize("per_tensor", [True, False])
@pytest.mark.parametrize("impl_style", ["hls", "rtl"])
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_fpgadataflow_analytical_characterization_thresholding(
    direction,
    num_input_channels,
    num_input_vecs,
    activation,
    idt_tdt_cfg,
    fold,
    narrow,
    per_tensor,
    impl_style,
    exec_mode,
    mem_mode,
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a Thresholding node.

    A MultiThreshold graph is converted to a thresholding layer, specialized to
    the requested ``impl_style`` and configured with the requested folding. The
    ``io_chrc_in`` or ``io_chrc_out`` attribute (selected by ``direction``) of
    both results must match according to ``compare_two_chr_funcs``.
    """
    # the mem_mode parameter can only be used for the hls thresholding
    # so the test will only be executed once for impl_style=rtl and once skipped
    # when the mem_mode is varied. Otherwise, the same test configuration would always
    # run twice.
    if impl_style == "rtl" and mem_mode == "internal_decoupled":
        pytest.skip(
            "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded"
        )
    if narrow and activation == DataType["BIPOLAR"]:
        pytest.skip("Narrow needs to be false with biploar activation.")
    input_data_type, threshold_data_type = idt_tdt_cfg
    num_steps = activation.get_num_possible_values() - 1

    if fold == -1:
        fold = num_input_channels
    pe = num_input_channels // fold
    if num_input_channels % pe != 0:
        pytest.skip("Invalid folding configuration. Skipping test.")

    output_data_type = activation
    if activation == DataType["BIPOLAR"]:
        activation_bias = 0
    else:
        activation_bias = activation.min()
        if narrow and activation.signed():
            activation_bias += 1

    # Generate random thresholds and sort in ascending order
    thresholds = generate_random_threshold_values(
        threshold_data_type, num_input_channels, num_steps, narrow, per_tensor
    )

    # provide non-decreasing/ascending thresholds
    thresholds = sort_thresholds_increasing(thresholds)

    # Make a Multithreshold graph and convert to thresholding binary search node
    model = make_single_multithresholding_modelwrapper(
        thresholds,
        input_data_type,
        threshold_data_type,
        output_data_type,
        activation_bias,
        num_input_vecs,
        num_input_channels,
    )

    # NOTE: no reference output is computed here; this test only compares
    # characteristic functions and never checks functional results.
    model = model.transform(InferThresholdingLayer())

    # Transform to the specified implementation style, either the
    # RTL or HLS according to test parameters
    node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("preferred_impl_style", impl_style)
    model = model.transform(SpecializeLayers(test_fpga_part))
    model = model.transform(InferShapes())
    assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style)

    # Configure the specialized node. All attributes are set BEFORE the next
    # transform: a transform may hand back a copy of the model, in which case
    # `inst` would keep pointing at the node of the old model.
    node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("PE", pe)
    if impl_style == "hls":
        inst.set_nodeattr("mem_mode", mem_mode)
    model = model.transform(GiveUniqueNodeNames())

    node_details = (
        "Thresholding",
        thresholds,
        input_data_type,
        threshold_data_type,
        output_data_type,
        activation_bias,
        num_input_vecs,
        num_input_channels,
        "hls",
    )

    allowed_chr_offset_positions = 5

    # each strategy gets its own copy of the model
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(
        model, node_details, test_fpga_part, target_clk_ns, "analytical"
    )
    node_rtlsim = get_characteristic_fnc(
        model_rtl, node_details, test_fpga_part, target_clk_ns, "rtlsim"
    )
    if direction == "input":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_in"),
            node_rtlsim.get_nodeattr("io_chrc_in"),
            allowed_chr_offset_positions,
        )
    elif direction == "output":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_out"),
            node_rtlsim.get_nodeattr("io_chrc_out"),
            allowed_chr_offset_positions,
        )
# which port to test
@pytest.mark.parametrize("direction", ["input", "output"])
# input datatype
@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]])
# weight datatype
@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]])
# activation: None or DataType
@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None])
# PE
@pytest.mark.parametrize("pe", [1, 3, 6])
# SIMD
@pytest.mark.parametrize("simd", [1, 9])
# Input image shape
@pytest.mark.parametrize("dim_h", [10])
@pytest.mark.parametrize("dim_w", [10, 1])
# Kernel shape
@pytest.mark.parametrize("k_h", [3])
@pytest.mark.parametrize("k_w", [3, 1])
# Number of input and output channels
@pytest.mark.parametrize("channels", [3, 6])
# memory mode
@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
# execution mode
@pytest.mark.parametrize("exec_mode", ["rtlsim"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_analytical_characterization_vvau(
    direction, idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode
):
    """Compare the "analytical" and "rtlsim" characteristic functions of a
    single VVAU node.

    A one-node VVAU model is built with _make_single_vvau_modelwrapper and
    handed to get_characteristic_fnc once per strategy ("analytical" on the
    model itself, "rtlsim" on a deep copy). Depending on ``direction`` the
    ``io_chrc_in`` or ``io_chrc_out`` node attribute of both results is passed
    to compare_two_chr_funcs together with allowed_chr_offset_positions (5),
    and the result is asserted.

    ``exec_mode`` is part of the parametrization but is not read by the body.
    """
    # function-scope imports keep this test independent of module-level
    # import lines outside this block
    import copy

    from finn.util.test import compare_two_chr_funcs, get_characteristic_fnc

    if dim_w == 1 and k_w != 1:
        pytest.skip("1D image requires 1D kernel, skipping.")

    if channels % pe != 0:
        pytest.skip("Requirement Channels divisable by PE is violated.")

    if (k_h * k_w) % simd != 0:
        pytest.skip("Requirement kernel (k_h * k_w) divisable by SIMD is violated.")

    # Generate weights in expected shape for ONNX and HLS node
    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]

    # No input tensor is generated here: this test only compares the two
    # characteristic functions and never executes the node on data.

    if act is None:
        # no activation: no thresholds, output is the raw accumulator type
        T = None
        tdt = None
        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
            odt = DataType["UINT32"]
        else:
            odt = DataType["INT32"]
    else:
        odt = act
        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
        T = np.sort(T, axis=1)
        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
            tdt = DataType["UINT32"]
            # bias thresholds to be positive
            T = np.ceil((T + (k_h * k_w)) / 2)
            assert (T >= 0).all()
        else:
            tdt = DataType["INT32"]

    model = _make_single_vvau_modelwrapper(
        W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode
    )
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())

    node_details = (
        "VVAU",
        W,
        pe,
        simd,
        k_h,
        k_w,
        channels,
        dim_h,
        dim_w,
        wdt,
        idt,
        odt,
        T,
        tdt,
        mem_mode,
        "hls",
    )
    part = "xc7z020clg400-1"
    target_clk_ns = 4
    allowed_chr_offset_positions = 5

    # each strategy gets its own model object
    model_rtl = copy.deepcopy(model)
    node_analytical = get_characteristic_fnc(model, node_details, part, target_clk_ns, "analytical")
    node_rtlsim = get_characteristic_fnc(model_rtl, node_details, part, target_clk_ns, "rtlsim")
    if direction == "input":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_in"),
            node_rtlsim.get_nodeattr("io_chrc_in"),
            allowed_chr_offset_positions,
        )
    elif direction == "output":
        assert compare_two_chr_funcs(
            node_analytical.get_nodeattr("io_chrc_out"),
            node_rtlsim.get_nodeattr("io_chrc_out"),
            allowed_chr_offset_positions,
        )
get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_cnv_") - @pytest.mark.streamline # act bits @@ -64,6 +62,7 @@ def test_streamline_cnv(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_cnv_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index edc4a96fe2..9ce2f2ab65 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -52,8 +52,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_fc_") - @pytest.mark.streamline # act bits @@ -68,6 +66,7 @@ def test_streamline_fc(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx)