diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index 5126ed3ff4..012da634f0 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -65,12 +65,18 @@ RUN apt-get update && \
python-is-python3 \
python3-pip \
python3-setuptools-scm \
- python3-venv
+ python3-venv \
+ pybind11-dev \
+ libfmt-dev \
+ libboost-dev \
+ libjansson-dev \
+ libgetdata-dev \
+ libtinfo5
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
RUN locale-gen "en_US.UTF-8"
# install Verilator from source to get the right version
-RUN apt-get install -y git perl make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev
+RUN apt-get install -y git perl make autoconf g++-10 flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev
RUN git clone https://github.com/verilator/verilator
RUN cd verilator && \
git checkout v4.224 && \
@@ -95,7 +101,7 @@ RUN pip install -r /tmp/requirements.txt
RUN rm /tmp/requirements.txt
# install PyTorch
-RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+RUN pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --extra-index-url https://download.pytorch.org/whl/cu121
# extra Python package dependencies (for testing and interaction)
RUN pip install pygments==2.14.0
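As a quick sanity check of the PyTorch bump above, the container can be probed for the expected versions. This is a hypothetical verification snippet, not part of the Dockerfile:

    # Sketch: confirm the torch==2.1.1 / cu121 wheels are active.
    import torch
    print(torch.__version__)          # expected to start with "2.1.1"
    print(torch.version.cuda)         # expected "12.1" for the cu121 build
    print(torch.cuda.is_available())  # True only if a GPU is exposed to the container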
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index c7500bcaa6..26a3388efd 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -59,12 +59,13 @@ recho () {
mv ${FINN_ROOT}/deps/qonnx/pyproject.toml ${FINN_ROOT}/deps/qonnx/pyproject.tmp
pip install --user -e ${FINN_ROOT}/deps/qonnx
mv ${FINN_ROOT}/deps/qonnx/pyproject.tmp ${FINN_ROOT}/deps/qonnx/pyproject.toml
-# finn-experimental
-pip install --user -e ${FINN_ROOT}/deps/finn-experimental
-# brevitas
-pip install --user -e ${FINN_ROOT}/deps/brevitas
-# pyverilator
-pip install --user -e ${FINN_ROOT}/deps/pyverilator
+
+tail -n +3 python_repos.txt | while IFS=',' read -r -a arr ; do
+    # split each CSV line (dir,url,commit) into the $arr array;
+    # the header row and the qonnx row (installed above) are skipped
+ pip install --user -e ${FINN_ROOT}/deps/"${arr[0]}"
+done
+
+
if [ -f "${FINN_ROOT}/setup.py" ];then
# run pip install for finn
@@ -87,7 +88,7 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then
gecho "Found XRT at $XILINX_XRT"
else
recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
- exit -1
+ #exit -1
fi
else
yecho "Unable to find $VITIS_PATH/settings64.sh"
@@ -105,6 +106,22 @@ else
fi
fi
+if [ -z "${XILINX_VIVADO}" ]; then
+ yecho "pyxsi will be unavailable since Vivado was not found"
+else
+ if [ -f "${FINN_ROOT}/deps/pyxsi/pyxsi.so" ]; then
+ gecho "Found pyxsi at ${FINN_ROOT}/deps/pyxsi/pyxsi.so"
+ else
+    # use a private variable: bash's special OLDPWD is rewritten by cd itself
+    OLD_PWD=$(pwd)
+    cd ${FINN_ROOT}/deps/pyxsi
+    touch .dockerenv
+    make
+    cd $OLD_PWD
+ fi
+ export PYTHONPATH=$PYTHONPATH:${FINN_ROOT}/deps/pyxsi:${FINN_ROOT}/deps/pyxsi/py
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lib/x86_64-linux-gnu/:${XILINX_VIVADO}/lib/lnx64.o
+fi
+
if [ -f "$HLS_PATH/settings64.sh" ];then
# source Vitis HLS env.vars
source $HLS_PATH/settings64.sh
@@ -129,6 +146,7 @@ if [ -d "$FINN_ROOT/.Xilinx" ]; then
mkdir "$HOME/.Xilinx/Vivado/"
cp "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" "$HOME/.Xilinx/Vivado/"
gecho "Found Vivado_init.tcl and copied to $HOME/.Xilinx/Vivado/Vivado_init.tcl"
+
else
yecho "Unable to find $FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl"
fi
@@ -137,6 +155,9 @@ else
echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
fi
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$VITIS_PATH/lnx64/tools/fpo_v7_1"
+
export PATH=$PATH:$HOME/.local/bin
+
# execute the provided command(s) as root
exec "$@"
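The entrypoint now derives its editable installs from python_repos.txt (added later in this diff) and builds pyxsi on demand when Vivado is available, exporting PYTHONPATH and LD_LIBRARY_PATH so the module can be imported. A minimal sketch of how downstream code can probe for it, mirroring the guarded import that rtlsim_exec.py adopts below:

    # Sketch: pyxsi is only importable if the entrypoint found Vivado,
    # built deps/pyxsi, and extended PYTHONPATH/LD_LIBRARY_PATH.
    try:
        import pyxsi_utils
    except ModuleNotFoundError:
        pyxsi_utils = None  # fall back to another rtlsim backend
    if pyxsi_utils is None:
        print("pyxsi unavailable; was Vivado present when the container started?")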
diff --git a/fetch-repos.sh b/fetch-repos.sh
index a4fc124fa4..081b3a470d 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,35 +27,25 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9"
-FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
-BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
-PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
-CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+CNPY_COMMIT="8c82362372ce600bbd1cf11d64661ab69d38d7de"
+HLSLIB_COMMIT="7783acaac835e702da25aa6b7103254b3cbcdf83"
OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696"
KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79"
EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a"
+PYXSI_COMMIT="28051f8dad7644614fc50dc755d1def9e45fc97b"
-QONNX_URL="https://github.com/fastmachinelearning/qonnx.git"
-FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git"
-BREVITAS_URL="https://github.com/Xilinx/brevitas.git"
-PYVERILATOR_URL="https://github.com/maltanar/pyverilator.git"
-CNPY_URL="https://github.com/rogersce/cnpy.git"
+CNPY_URL="https://github.com/maltanar/cnpy.git"
HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git"
OMX_URL="https://github.com/maltanar/oh-my-xilinx.git"
AVNET_BDF_URL="https://github.com/Avnet/bdf.git"
XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
RFSOC4x2_BDF_URL="https://github.com/RealDigitalOrg/RFSoC4x2-BSP.git"
KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
+PYXSI_URL="https://github.com/maltanar/pyxsi.git"
-QONNX_DIR="qonnx"
-FINN_EXP_DIR="finn-experimental"
-BREVITAS_DIR="brevitas"
-PYVERILATOR_DIR="pyverilator"
CNPY_DIR="cnpy"
HLSLIB_DIR="finn-hlslib"
OMX_DIR="oh-my-xilinx"
@@ -63,6 +53,7 @@ AVNET_BDF_DIR="avnet-bdf"
XIL_BDF_DIR="xil-bdf"
RFSOC4x2_BDF_DIR="rfsoc4x2-bdf"
KV260_SOM_BDF_DIR="kv260-som-bdf"
+PYXSI_DIR="pyxsi"
# absolute path to this script, e.g. /home/user/bin/foo.sh
SCRIPT=$(readlink -f "$0")
@@ -115,10 +106,12 @@ fetch_board_files() {
cd $OLD_PWD
}
-fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR
-fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR
-fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR
-fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR
+
+tail -n +2 python_repos.txt | while IFS=',' read -r -a arr ; do
+    # split each CSV line (dir,url,commit) into the $arr array;
+    # only the header row is skipped here
+ fetch_repo "${arr[1]}" "${arr[2]}" "${arr[0]}"
+done
+
fetch_repo $CNPY_URL $CNPY_COMMIT $CNPY_DIR
fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
@@ -126,6 +119,7 @@ fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR
fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR
fetch_repo $RFSOC4x2_BDF_URL $RFSOC4x2_BDF_COMMIT $RFSOC4x2_BDF_DIR
fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR
+fetch_repo $PYXSI_URL $PYXSI_COMMIT $PYXSI_DIR
# Can skip downloading of board files entirely if desired
if [ "$FINN_SKIP_BOARD_FILES" = "1" ]; then
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 722da1d803..9d19ebbaf8 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -5,6 +5,36 @@
memstream
1.0
+
+ ap_clk
+
+
+
+
+
+
+ CLK
+
+
+ ap_clk
+
+
+
+
+
+ ASSOCIATED_RESET
+ ap_rst_n
+
+
+ ASSOCIATED_BUSIF
+ m_axis_0:s_axilite
+
+
+ FREQ_TOLERANCE_HZ
+ -1
+
+
+
m_axis_0
@@ -42,7 +72,7 @@
-
+
@@ -222,7 +252,7 @@
- ap_clk
+ ap_clk2x
@@ -232,30 +262,26 @@
CLK
- ap_clk
+ ap_clk2x
ASSOCIATED_RESET
- ap_rst_n
-
-
- ASSOCIATED_BUSIF
- m_axis_0:s_axilite
+ ap_rst_n
FREQ_TOLERANCE_HZ
- -1
+ -1
- interface_aximm
- interface_aximm
+ s_axilite
+ s_axilite
reg0
reg0
@@ -272,7 +298,7 @@
xilinx_anylanguagesynthesis
Synthesis
:vivado.xilinx.com:synthesis
- SystemVerilog
+ Verilog
memstream_axi_wrapper
xilinx_anylanguagesynthesis_view_fileset
@@ -280,7 +306,7 @@
viewChecksum
- 04464096
+ 95b1241c
@@ -288,7 +314,7 @@
xilinx_anylanguagebehavioralsimulation
Simulation
:vivado.xilinx.com:simulation
- SystemVerilog
+ Verilog
memstream_axi_wrapper
xilinx_anylanguagebehavioralsimulation_view_fileset
@@ -296,19 +322,7 @@
viewChecksum
- 9e058959
-
-
-
-
- xilinx_implementation
- Implementation
- :vivado.xilinx.com:implementation
- memstream_axi_wrapper
-
-
- viewChecksum
- cd434062
+ 95b1241c
@@ -322,7 +336,7 @@
viewChecksum
- 6c92393d
+ 35708916
@@ -336,7 +350,7 @@
viewChecksum
- 923e7b90
+ 09540bf8
@@ -355,6 +369,19 @@
+
+ ap_clk2x
+
+ in
+
+
+ std_logic
+ xilinx_anylanguagesynthesis
+ xilinx_anylanguagebehavioralsimulation
+
+
+
+
ap_rst_n
@@ -752,6 +779,11 @@
Ram Style
auto
+
+ PUMPED_MEMORY
+ Pumped Memory
+ false
+
AXILITE_ADDR_WIDTH
Axilite Addr Width
@@ -769,10 +801,6 @@
xilinx_anylanguagesynthesis_view_fileset
-
- hdl/axilite_if.v
- verilogSource
-
hdl/memstream.sv
systemVerilogSource
@@ -784,7 +812,11 @@
hdl/memstream_axi_wrapper.v
verilogSource
- CHECKSUM_7caabca7
+
+
+ hdl/axilite_if.v
+ verilogSource
+ CHECKSUM_69d1ba26
@@ -792,26 +824,19 @@
hdl/memstream.sv
systemVerilogSource
- USED_IN_ipstatic
- xil_defaultlib
hdl/memstream_axi.sv
systemVerilogSource
- USED_IN_ipstatic
- xil_defaultlib
- hdl/axilite_if.v
+ hdl/memstream_axi_wrapper.v
verilogSource
- USED_IN_ipstatic
- xil_defaultlib
- hdl/memstream_axi_wrapper.v
+ hdl/axilite_if.v
verilogSource
USED_IN_ipstatic
- xil_defaultlib
@@ -819,7 +844,7 @@
xgui/memstream_v1_0.tcl
tclSource
- CHECKSUM_32cad48d
+ CHECKSUM_35708916
XGUI_VERSION_2
@@ -869,9 +894,41 @@
Component_Name
memstream_axi_wrapper_v1_0
+
+ PUMPED_MEMORY
+ Pumped Memory
+ false
+
+
+ virtex7
+ qvirtex7
+ versal
+ kintex7
+ kintex7l
+ qkintex7
+ qkintex7l
+ akintex7
+ artix7
+ artix7l
+ aartix7
+ qartix7
+ zynq
+ qzynq
+ azynq
+ spartan7
+ aspartan7
+ virtexu
+ zynquplus
+ virtexuplus
+ virtexuplusHBM
+ virtexuplus58g
+ kintexuplus
+ artixuplus
+ kintexu
+
/UserIP
@@ -879,23 +936,23 @@
level_1
package_project
AMD
- 5
+ 3
user.org:user:memstream_axi_wrapper:1.0
- 2023-05-24T06:34:57Z
+ 2023-12-13T15:36:23Z
- 2022.2
-
-
-
-
+ 2022.1
+
+
+
+
-
+
diff --git a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
index 271f9df453..3c34422cac 100644
--- a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
+++ b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
@@ -1,2 +1,2 @@
# This file is automatically written. Do not modify.
-proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {DEPTH WIDTH } {expr 2 + ceil(log($DEPTH*pow(2, ceil(log(($WIDTH+31)/32)/log(2))))/log(2))}
+proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {DEPTH WIDTH } {expr ceil(log($DEPTH*(2**ceil( log(($WIDTH+31)/32)/log(2) )))/log(2)) + 2}
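The rewritten Tcl expression is numerically identical to the old one: pow(2, ...) becomes 2**... and the leading "2 +" moves to a trailing "+ 2". A worked Python equivalent (my own sketch of the expr, not generated code):

    import math

    def axilite_addr_width(depth: int, width: int) -> int:
        # 32-bit registers per entry, rounded up to a power of two
        words = 2 ** math.ceil(math.log2((width + 31) // 32))
        # +2 turns the register index into a byte address
        return math.ceil(math.log2(depth * words)) + 2

    print(axilite_addr_width(1024, 32))  # -> 12, matching $clog2(1024 * 2**$clog2(1)) + 2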
diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
deleted file mode 100644
index 11cef604e0..0000000000
--- a/finn-rtllib/memstream/hdl/Q_srl.v
+++ /dev/null
@@ -1,308 +0,0 @@
-// original source:
-// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
-
-
-// Copyright (c) 1999 The Regents of the University of California
-// Copyright (c) 2010 The Regents of the University of Pennsylvania
-// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
-// Copyright (c) 2020 Xilinx
-//
-// Permission to use, copy, modify, and distribute this software and
-// its documentation for any purpose, without fee, and without a
-// written agreement is hereby granted, provided that the above copyright
-// notice and this paragraph and the following two paragraphs appear in
-// all copies.
-//
-// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
-// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
-// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
-// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-//
-// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
-// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
-// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
-//
-
-// Q_srl_oreg3_prefull_SIMPLE.v
-//
-// - In-page queue with parameterizable depth, bit width
-// - Stream I/O is triple (data, valid, back-pressure),
-// with EOS concatenated into the data
-// - Flow control for input & output is combinationally decoupled
-// - 2 <= depth <= 256
-// * (depth >= 2) is required to decouple I/O flow control,
-// where empty => no produce, full => no consume,
-// and depth 1 would ping-pong between the two at half rate
-// * (depth <= 256) can be modified
-// by changing ''synthesis loop_limit X'' below
-// and changing ''addrwidth'' or its log computation
-// - 1 <= width
-// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
-// plus output register (for fast output)
-// - Queue addressing is done by ''addr'' up-down counter
-// - Queue fullness is checked by comparator (addr==depth)
-// - Queue fullness is pre-computed for next cycle
-// - Queue input back-pressure is pre-computed for next cycle
-// - Queue output valid (state!=state__empty) is pre-computed for next cycle
-// (necessary since SRL data output reg requires non-boolean state)
-// - FSM has 3 states (empty, one, more)
-// - When empty, continue to emit most recently emitted value (for debugging)
-//
-// - Queue slots used = / (state==state_empty) ? 0
-// | (state==state_one) ? 1
-// \ (state==state_more) ? addr+2
-// - Queue slots used <= depth
-// - Queue slots remaining = depth - used
-// = / (state==state_empty) ? depth
-// | (state==state_one) ? depth-1
-// \ (state==state_more) ? depth-2-addr
-//
-// - Synplify 7.1 / 8.0
-// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05
-
-
-`ifdef Q_srl
-`else
-`define Q_srl
-
-
-module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
-
- parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256)
- parameter width = 16; // - width of data (i_d, o_d)
-
- parameter addrwidth = $clog2(depth);
-
- input clock;
- input reset;
-
- input [width-1:0] i_d; // - input stream data (concat data + eos)
- input i_v; // - input stream valid
- output i_r; // - input stream ready
- wire i_b; // - input stream back-pressure
-
- output [width-1:0] o_d; // - output stream data (concat data + eos)
- output o_v; // - output stream valid
- input o_r; // - output stream ready
- wire o_b; // - output stream back-pressure
-
- output [addrwidth:0] count; // - output number of elems in queue
- output [addrwidth:0] maxcount; // - maximum observed count since reset
-
- reg [addrwidth:0] maxcount_reg; // - maximum count seen until now
- reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address
- // for data output
- reg shift_en_; // - SRL16 shift enable
- reg [width-1:0] srl [depth-2:0]; // - SRL16 memory
- reg shift_en_o_; // - SRLO shift enable
- reg [width-1:0] srlo_, srlo // - SRLO output reg
- /* synthesis syn_allow_retiming=0 */ ;
-
- parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED
- parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo
- parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo
- // #items in srl = addr+2
-
- reg [1:0] state, state_; // - state register
-
- wire addr_full_; // - true iff addr==depth-2 on NEXT cycle
- reg addr_full; // - true iff addr==depth-2
- wire addr_zero_; // - true iff addr==0
- wire o_v_reg_; // - true iff state_empty on NEXT cycle
- reg o_v_reg // - true iff state_empty
- /* synthesis syn_allow_retiming=0 */ ;
- wire i_b_reg_; // - true iff !full on NEXT cycle
- reg i_b_reg // - true iff !full
- /* synthesis syn_allow_retiming=0 */ ;
-
- assign addr_full_ = (state_==state_more) && (addr_==depth-2);
- // - queue full
- assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0)
- assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty
- assign i_b_reg_ = addr_full_; // - input bp if full
- assign o_d = srlo; // - output data from queue
- assign o_v = o_v_reg; // - output valid if non-empty
- assign i_b = i_b_reg; // - input bp if full
- assign maxcount = maxcount_reg;
-
- assign i_r = !i_b;
- assign o_b = !o_r;
-
- assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
-
- // - ''always'' block with both FFs and SRL16 does not work,
- // since FFs need reset but SRL16 does not
-
- always @(posedge clock) begin // - seq always: FFs
- if (reset) begin
- state <= state_empty;
- addr <= 0;
- addr_full <= 0;
- o_v_reg <= 0;
-
- i_b_reg <= 0;
- maxcount_reg <= 0;
-
- end
- else begin
- state <= state_;
- addr <= addr_;
- addr_full <= addr_full_;
- o_v_reg <= o_v_reg_;
- i_b_reg <= i_b_reg_;
- maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
- end
- end // always @ (posedge clock)
-
- always @(posedge clock) begin // - seq always: srlo
- // - infer enabled output reg at end of shift chain
- // - input first element from i_d, all subsequent elements from SRL16
- if (reset) begin
- srlo <= 0;
- end
- else begin
- if (shift_en_o_) begin
- srlo <= srlo_;
- end
- end
- end // always @ (posedge clock)
-
- always @(posedge clock) begin // - seq always: srl
- // - infer enabled SRL16E from shifting srl array
- // - no reset capability; srl[] contents undefined on reset
- if (shift_en_) begin
- // synthesis loop_limit 256
- for (a_=depth-2; a_>0; a_=a_-1) begin
- srl[a_] = srl[a_-1];
- end
- srl[0] <= i_d;
- end
- end // always @ (posedge clock or negedge reset)
-
- always @* begin // - combi always
- srlo_ <= 'bx;
- shift_en_o_ <= 1'bx;
- shift_en_ <= 1'bx;
- addr_ <= 'bx;
- state_ <= 2'bx;
- case (state)
-
- state_empty: begin // - (empty, will not produce)
- if (i_v) begin // - empty & i_v => consume
- srlo_ <= i_d;
- shift_en_o_ <= 1;
- shift_en_ <= 1'bx;
- addr_ <= 0;
- state_ <= state_one;
- end
- else begin // - empty & !i_v => idle
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 1'bx;
- addr_ <= 0;
- state_ <= state_empty;
- end
- end
-
- state_one: begin // - (contains one)
- if (i_v && o_b) begin // - one & i_v & o_b => consume
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 1;
- addr_ <= 0;
- state_ <= state_more;
- end
- else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod
- srlo_ <= i_d;
- shift_en_o_ <= 1;
- shift_en_ <= 1;
- addr_ <= 0;
- state_ <= state_one;
- end
- else if (!i_v && o_b) begin // - one & !i_v & o_b => idle
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 1'bx;
- addr_ <= 0;
- state_ <= state_one;
- end
- else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 1'bx;
- addr_ <= 0;
- state_ <= state_empty;
- end
- end // case: state_one
-
- state_more: begin // - (contains more than one)
- if (addr_full || (depth==2)) begin
- // - (full, will not consume)
- // - (full here if depth==2)
- if (o_b) begin // - full & o_b => idle
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 0;
- addr_ <= addr;
- state_ <= state_more;
- end
- else begin // - full & !o_b => produce
- srlo_ <= srl[addr];
- shift_en_o_ <= 1;
- shift_en_ <= 0;
-// addr_ <= addr-1;
-// state_ <= state_more;
- addr_ <= addr_zero_ ? 0 : addr-1;
- state_ <= addr_zero_ ? state_one : state_more;
- end
- end
- else begin // - (mid: neither empty nor full)
- if (i_v && o_b) begin // - mid & i_v & o_b => consume
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 1;
- addr_ <= addr+1;
- state_ <= state_more;
- end
- else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod
- srlo_ <= srl[addr];
- shift_en_o_ <= 1;
- shift_en_ <= 1;
- addr_ <= addr;
- state_ <= state_more;
- end
- else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle
- srlo_ <= 'bx;
- shift_en_o_ <= 0;
- shift_en_ <= 0;
- addr_ <= addr;
- state_ <= state_more;
- end
- else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce
- srlo_ <= srl[addr];
- shift_en_o_ <= 1;
- shift_en_ <= 0;
- addr_ <= addr_zero_ ? 0 : addr-1;
- state_ <= addr_zero_ ? state_one : state_more;
- end
- end // else: !if(addr_full)
- end // case: state_more
-
- default: begin
- srlo_ <= 'bx;
- shift_en_o_ <= 1'bx;
- shift_en_ <= 1'bx;
- addr_ <= 'bx;
- state_ <= 2'bx;
- end // case: default
-
- endcase // case(state)
- end // always @ *
-
-endmodule // Q_srl
-
-
-`endif // `ifdef Q_srl
diff --git a/finn-rtllib/memstream/hdl/memstream.sv b/finn-rtllib/memstream/hdl/memstream.sv
index 9cbef493a3..eeb6d571c4 100644
--- a/finn-rtllib/memstream/hdl/memstream.sv
+++ b/finn-rtllib/memstream/hdl/memstream.sv
@@ -129,7 +129,7 @@ module memstream #(
// Stage #2: Memory Access
logic Rb2 = 0;
logic Rs2 = 0;
- data_t Data2 = 'x;
+ data_t Data2;
if(1) begin : blkStage2
(* RAM_STYLE = RAM_STYLE *)
data_t Mem[DEPTH];
@@ -139,13 +139,58 @@ module memstream #(
// Execute Memory Operation
uwire addr_t addr = Ptr[1].val;
+ data_t RdOut;
always_ff @(posedge clk) begin
if(en) begin
+ // NO_CHANGE mode as READ and WRITE never happen together.
if(Wr1) Mem[addr] <= Data1;
- Data2 <= Mem[addr];
+ else RdOut <= Mem[addr];
end
end
+	// Stretch by Additional Pipeline Stages for Targeting URAM
+ localparam bit STRETCH = (RAM_STYLE == "ultra") || (RAM_STYLE == "ULTRA");
+
+ uwire logic irb = Rb1;
+ uwire logic irs = Rs1 && !rollback;
+ uwire ptr_t iptr = Ptr[1];
+ uwire logic orb;
+ uwire logic ors;
+ uwire ptr_t optr;
+
+ if(!STRETCH) begin
+ assign orb = irb;
+ assign ors = irs;
+ assign optr = iptr;
+
+ assign Data2 = RdOut;
+ end
+ else begin
+ logic SRb = 0;
+ logic SRs = 0;
+ ptr_t SPtr = '{ default: 'x };
+ data_t SData = 'x;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ SRb <= 0;
+ SRs <= 0;
+ SPtr <= '{ default: 'x };
+ SData <= 'x;
+ end
+ else if(en) begin
+ SRb <= irb;
+ SRs <= irs;
+ SPtr <= iptr;
+ SData <= RdOut;
+ end
+ end
+ assign orb = SRb;
+ assign ors = SRs && !rollback;
+ assign optr = SPtr;
+
+ assign Data2 = SData;
+ end
+
// Copy Output Designation
always_ff @(posedge clk) begin
if(rst) begin
@@ -154,9 +199,9 @@ module memstream #(
Ptr[2] <= '{ default: 'x };
end
else if(en) begin
- Rb2 <= Rb1;
- Rs2 <= Rs1 && !rollback;
- Ptr[2] <= Ptr[1];
+ Rb2 <= orb;
+ Rs2 <= ors;
+ Ptr[2] <= optr;
end
end
end : blkStage2
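The STRETCH path above adds one register stage on the read data and its control bits whenever RAM_STYLE selects URAM, and the memory port now behaves as NO_CHANGE because a read and a write never coincide. A tiny latency model of blkStage2 (a sketch under those assumptions, not the RTL):

    def stage2_read_latency(ram_style: str) -> int:
        # "ultra"/"ULTRA" engage the extra SRb/SRs/SPtr/SData register stage
        return 2 if ram_style.lower() == "ultra" else 1

    print(stage2_read_latency("auto"))   # 1 cycle: Mem[addr] -> RdOut
    print(stage2_read_latency("ultra"))  # 2 cycles: Mem[addr] -> RdOut -> SData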
diff --git a/finn-rtllib/memstream/hdl/memstream_axi.sv b/finn-rtllib/memstream/hdl/memstream_axi.sv
index 136bcb1d7e..7f9b7b47b0 100644
--- a/finn-rtllib/memstream/hdl/memstream_axi.sv
+++ b/finn-rtllib/memstream/hdl/memstream_axi.sv
@@ -36,11 +36,13 @@ module memstream_axi #(
parameter INIT_FILE = "",
parameter RAM_STYLE = "auto",
+ bit PUMPED_MEMORY = 0,
localparam int unsigned AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2
)(
// Global Control
input logic clk,
+ input logic clk2x,
input logic rst,
// AXI-lite Write
@@ -110,25 +112,152 @@ module memstream_axi #(
//-----------------------------------------------------------------------
// Streaming Memory Backend
- memstream #(
- .DEPTH(DEPTH),
- .WIDTH(WIDTH),
- .INIT_FILE(INIT_FILE),
- .RAM_STYLE(RAM_STYLE)
- ) mem (
- .clk, .rst,
-
- .config_address,
- .config_ce,
- .config_we,
- .config_d0,
- .config_q0,
- .config_rack,
-
- .ordy(m_axis_0_tready),
- .ovld(m_axis_0_tvalid),
- .odat(m_axis_0_tdata[WIDTH-1:0])
- );
+ localparam int unsigned DEPTH_EFF = PUMPED_MEMORY? 2*DEPTH : DEPTH;
+ localparam int unsigned WIDTH_EFF = PUMPED_MEMORY? (WIDTH+1)/2 : WIDTH;
+ uwire mem_ce;
+ uwire mem_we;
+ uwire [ 31:0] mem_a0;
+ uwire [WIDTH_EFF-1:0] mem_d0;
+ uwire mem_rack;
+ uwire [WIDTH_EFF-1:0] mem_q0;
+ uwire mem_rdy;
+ uwire mem_vld;
+ uwire [WIDTH_EFF-1:0] mem_dat;
+ if(!PUMPED_MEMORY) begin : genUnpumped
+ assign mem_ce = config_ce;
+ assign mem_we = config_we;
+ assign mem_a0 = config_address;
+ assign mem_d0 = config_d0;
+ assign config_rack = mem_rack;
+ assign config_q0 = mem_q0;
+
+ assign mem_rdy = m_axis_0_tready;
+ assign m_axis_0_tvalid = mem_vld;
+ assign m_axis_0_tdata = mem_dat;
+
+ memstream #(
+ .DEPTH(DEPTH_EFF),
+ .WIDTH(WIDTH_EFF),
+ .INIT_FILE(INIT_FILE),
+ .RAM_STYLE(RAM_STYLE)
+ ) mem (
+ .clk(clk), .rst,
+
+ .config_address(mem_a0),
+ .config_ce(mem_ce),
+ .config_we(mem_we),
+ .config_d0(mem_d0),
+ .config_q0(mem_q0),
+ .config_rack(mem_rack),
+
+ .ordy(mem_rdy),
+ .ovld(mem_vld),
+ .odat(mem_dat)
+ );
+ end : genUnpumped
+ else begin : genPumped
+
+ // Identifier of fast active clock edge coinciding with slow active clock edge
+ logic Active;
+ always_ff @(posedge clk2x) begin
+ if(rst) Active <= 0;
+ else Active <= !Active;
+ end
+
+ // Clock translation for config requests, which are spread across two fast cycles
+ logic Cfg2x_CE = 0;
+ logic Cfg2x_WE = 'x;
+ logic [30 :0] Cfg2x_A0 = 'x;
+ logic [WIDTH-1:0] Cfg2x_D0 = 'x;
+ always_ff @(posedge clk2x) begin
+ if(rst) begin
+ Cfg2x_CE <= 0;
+ Cfg2x_WE <= 'x;
+ Cfg2x_A0 <= 'x;
+ Cfg2x_D0 <= 'x;
+ end
+ else begin
+ if(Active) begin
+ Cfg2x_CE <= config_ce;
+ Cfg2x_WE <= config_we;
+ Cfg2x_A0 <= config_address;
+ end
+ Cfg2x_D0 <= Active? config_d0 : { {(WIDTH-WIDTH_EFF){1'bx}}, Cfg2x_D0[WIDTH-1:WIDTH_EFF] };
+ end
+ end
+ assign mem_ce = Cfg2x_CE;
+ assign mem_we = Cfg2x_WE;
+ assign mem_a0 = { Cfg2x_A0, Active };
+ assign mem_d0 = Cfg2x_D0;
+
+ // Assemble two consecutive read replies into one
+ logic [1:0] Cfg2x_Rack = 0;
+ logic [2*WIDTH_EFF-1:0] Cfg2x_Q0 = 'x;
+ always_ff @(posedge clk2x) begin
+ if(rst) begin
+ Cfg2x_Rack <= 0;
+ Cfg2x_Q0 <= 'x;
+ end
+ else begin
+ if(mem_rack) Cfg2x_Q0 <= { mem_q0, Cfg2x_Q0[WIDTH_EFF+:WIDTH_EFF] };
+ // Count replies and clear when seen in slow clock domain
+ Cfg2x_Rack <= Cfg2x_Rack + mem_rack;
+ if(Cfg2x_Rack[1] && Active) Cfg2x_Rack <= 0;
+ end
+ end
+ assign config_rack = Cfg2x_Rack[1];
+ assign config_q0 = Cfg2x_Q0[WIDTH-1:0];
+
+ // Assemble two consecutive stream outputs into one
+ logic [3:0][WIDTH_EFF-1:0] SBuf = 'x;
+ logic [2:0] SCnt = 0; // 0..4
+ logic SVld = 0;
+ always_ff @(posedge clk2x) begin
+ if(rst) begin
+ SBuf <= 'x;
+ SCnt <= 0;
+ SVld <= 0;
+ end
+ else begin
+ automatic logic [4:0][WIDTH_EFF-1:0] sbuf = { {WIDTH_EFF{1'bx}}, SBuf };
+ automatic logic [2:0] scnt = SCnt;
+
+ sbuf[scnt] = mem_dat;
+ if(m_axis_0_tvalid && (Active && m_axis_0_tready)) begin
+ scnt[2:1] = { 1'b0, scnt[2] };
+ sbuf[1:0] = sbuf[3:2];
+ end
+ scnt += mem_rdy && mem_vld;
+
+ SBuf <= sbuf[3:0];
+ SCnt <= scnt;
+ if(Active) SVld <= |scnt[2:1];
+ end
+ end
+ assign mem_rdy = !SCnt[2];
+ assign m_axis_0_tvalid = SVld;
+ assign m_axis_0_tdata = { SBuf[1][0+:WIDTH-WIDTH_EFF], SBuf[0] };
+
+ memstream #(
+ .DEPTH(DEPTH_EFF),
+ .WIDTH(WIDTH_EFF),
+ .INIT_FILE(INIT_FILE),
+ .RAM_STYLE(RAM_STYLE)
+ ) mem (
+ .clk(clk2x), .rst,
+
+ .config_address(mem_a0),
+ .config_ce(mem_ce),
+ .config_we(mem_we),
+ .config_d0(mem_d0),
+ .config_q0(mem_q0),
+ .config_rack(mem_rack),
+
+ .ordy(mem_rdy),
+ .ovld(mem_vld),
+ .odat(mem_dat)
+ );
+ end : genPumped
if($bits(m_axis_0_tdata) > WIDTH) begin
assign m_axis_0_tdata[$left(m_axis_0_tdata):WIDTH] = '0;
end
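In genPumped the backing memstream runs on clk2x with half the width ((WIDTH+1)/2) and twice the depth, so each WIDTH-bit word travels as two half-words: the low half on the Active fast cycle, the high half on the next, as the Cfg2x_D0/Cfg2x_Q0 shift registers implement. A Python model of that split and reassembly (my own sketch, not the RTL):

    def width_eff(width: int) -> int:
        # WIDTH_EFF in memstream_axi.sv: half the word, rounded up
        return (width + 1) // 2

    def split_word(word: int, width: int):
        w = width_eff(width)
        return word & ((1 << w) - 1), word >> w  # (first fast cycle, second fast cycle)

    def join_halves(lo: int, hi: int, width: int) -> int:
        return (hi << width_eff(width)) | lo

    word = 0xDEADBEEF
    assert join_halves(*split_word(word, 32), 32) == word
    # DEPTH_EFF = 2*DEPTH: every logical entry occupies two half-width rows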
diff --git a/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v
index 13f5c82d6e..692720fc2d 100644
--- a/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v
+++ b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v
@@ -36,6 +36,7 @@ module memstream_axi_wrapper #(
parameter INIT_FILE = "",
parameter RAM_STYLE = "auto",
+ parameter PUMPED_MEMORY = 0,
parameter AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2
)(
@@ -43,6 +44,8 @@ module memstream_axi_wrapper #(
(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF m_axis_0, ASSOCIATED_RESET ap_rst_n" *)
(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
input ap_clk,
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+ input ap_clk2x,
(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
input ap_rst_n,
@@ -78,18 +81,18 @@ module memstream_axi_wrapper #(
output [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata
);
- localparam INIT_FILTERED =
-`ifdef SYNTHESIS
- RAM_STYLE == "ultra"? "" :
-`endif
- INIT_FILE;
+ // Used to be set to "" when targeting pre-Versal
+ // URAMs to avoid synth errors, temporarily disabled
+ // TODO add appropriate define check here for Versal
+ localparam INIT_FILTERED = INIT_FILE;
memstream_axi #(
.DEPTH(DEPTH), .WIDTH(WIDTH),
.INIT_FILE(INIT_FILTERED),
- .RAM_STYLE(RAM_STYLE)
+ .RAM_STYLE(RAM_STYLE),
+ .PUMPED_MEMORY(PUMPED_MEMORY)
) core (
- .clk(ap_clk), .rst(!ap_rst_n),
+ .clk(ap_clk), .clk2x(ap_clk2x), .rst(!ap_rst_n),
// AXI-lite Write
.awready(awready),
diff --git a/finn-rtllib/memstream/hdl/memstream_wrapper_template.v b/finn-rtllib/memstream/hdl/memstream_wrapper_template.v
new file mode 100644
index 0000000000..e48fd35f9b
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream_wrapper_template.v
@@ -0,0 +1,125 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+module $MODULE_NAME$_memstream_wrapper #(
+ parameter DEPTH = $DEPTH$,
+ parameter WIDTH = $WIDTH$,
+
+ parameter INIT_FILE = "$INIT_FILE$",
+ parameter RAM_STYLE = "$RAM_STYLE$",
+ parameter PUMPED_MEMORY = $PUMPED_MEMORY$,
+
+ parameter AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2
+)(
+ // Global Control
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF m_axis_0, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ input ap_clk,
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+ input ap_clk2x,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+
+ // AXI-lite Write
+ output awready,
+ input awvalid,
+ input [2:0] awprot,
+ input [AXILITE_ADDR_WIDTH-1:0] awaddr,
+
+ output wready,
+ input wvalid,
+ input [31:0] wdata,
+ input [ 3:0] wstrb,
+
+ input bready,
+ output bvalid,
+ output [1:0] bresp,
+
+ // AXI-lite Read
+ output arready,
+ input arvalid,
+ input [2:0] arprot,
+ input [AXILITE_ADDR_WIDTH-1:0] araddr,
+
+ input rready,
+ output rvalid,
+ output [ 1:0] rresp,
+ output [31:0] rdata,
+
+ // Continuous output stream
+ input m_axis_0_tready,
+ output m_axis_0_tvalid,
+ output [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata
+);
+
+ // Used to be set to "" when targeting pre-Versal
+ // URAMs to avoid synth errors, temporarily disabled
+ // TODO add appropriate define check here for Versal
+ localparam INIT_FILTERED = INIT_FILE;
+
+ memstream_axi #(
+ .DEPTH(DEPTH), .WIDTH(WIDTH),
+ .INIT_FILE(INIT_FILTERED),
+ .RAM_STYLE(RAM_STYLE),
+ .PUMPED_MEMORY(PUMPED_MEMORY)
+ ) core (
+ .clk(ap_clk), .clk2x(ap_clk2x), .rst(!ap_rst_n),
+
+ // AXI-lite Write
+ .awready(awready),
+ .awvalid(awvalid),
+ .awprot(awprot),
+ .awaddr(awaddr),
+ .wready(wready),
+ .wvalid(wvalid),
+ .wdata(wdata),
+ .wstrb(wstrb),
+ .bready(bready),
+ .bvalid(bvalid),
+ .bresp(bresp),
+
+ // AXI-lite Read
+ .arready(arready),
+ .arvalid(arvalid),
+ .arprot(arprot),
+ .araddr(araddr),
+ .rready(rready),
+ .rvalid(rvalid),
+ .rresp(rresp),
+ .rdata(rdata),
+
+ // Continuous output stream
+ .m_axis_0_tready(m_axis_0_tready),
+ .m_axis_0_tvalid(m_axis_0_tvalid),
+ .m_axis_0_tdata(m_axis_0_tdata)
+ );
+
+endmodule : $MODULE_NAME$_memstream_wrapper
diff --git a/finn-rtllib/memstream/sim/memstream_axi_tb.sv b/finn-rtllib/memstream/sim/memstream_axi_tb.sv
new file mode 100644
index 0000000000..ea0ea21f84
--- /dev/null
+++ b/finn-rtllib/memstream/sim/memstream_axi_tb.sv
@@ -0,0 +1,223 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ */
+
+module memstream_axi_tb;
+ localparam int unsigned DEPTH = 1024;
+ localparam int unsigned WIDTH = 32;
+ localparam bit PUMPED_MEMORY = 1;
+
+ localparam int unsigned AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2;
+
+ //- Global Control ------------------
+ logic clk = 1;
+ logic clk2x = 1;
+ always #5ns clk = !clk;
+ always #2.5ns clk2x = !clk2x;
+ logic rst = 1;
+ initial begin
+ repeat(8) @(posedge clk);
+ rst <= 0;
+ end
+
+ //- AXI-lite Interface --------------
+ // Write
+ uwire awready;
+ logic awvalid;
+ logic [AXILITE_ADDR_WIDTH-1:0] awaddr;
+
+ uwire wready;
+ logic wvalid;
+ logic [31:0] wdata;
+
+ uwire bready = 1;
+ uwire bvalid;
+ uwire [1:0] bresp;
+
+ // Read
+ uwire arready;
+ logic arvalid;
+ logic [AXILITE_ADDR_WIDTH-1:0] araddr;
+
+ logic rready;
+ uwire rvalid;
+ uwire [ 1:0] rresp;
+ uwire [31:0] rdata;
+
+ // Streamed Output
+ logic ordy;
+ uwire ovld;
+ uwire [WIDTH-1:0] odat;
+
+ //-----------------------------------------------------------------------
+ // DUT
+ memstream_axi #(.DEPTH(DEPTH), .WIDTH(WIDTH), .PUMPED_MEMORY(PUMPED_MEMORY)) dut (
+ // Global Control
+ .clk, .clk2x, .rst,
+
+ // AXI-lite Write
+ .awready, .awvalid, .awaddr, .awprot('x),
+ .wready, .wvalid, .wdata, .wstrb('1),
+ .bready, .bvalid, .bresp,
+
+ // AXI-lite Read
+ .arready, .arvalid, .araddr, .arprot('x),
+ .rready, .rvalid, .rdata, .rresp,
+
+ // Continuous output stream
+ .m_axis_0_tready(ordy), .m_axis_0_tvalid(ovld), .m_axis_0_tdata(odat)
+ );
+
+ always_ff @(posedge clk iff !rst) begin
+ assert(!bvalid || !bresp) else begin
+ $error("Write error.");
+ $stop;
+ end
+ end
+
+ initial begin
+ awvalid = 0;
+ awaddr = 'x;
+ wvalid = 0;
+ wdata = 'x;
+ arvalid = 0;
+ araddr = 'x;
+ rready = 0;
+ ordy = 0;
+ @(posedge clk iff !rst);
+
+ // Configuration
+ fork
+ begin
+ awvalid <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ awaddr <= { i, 2'b00 };
+ @(posedge clk iff awready);
+ end
+ awvalid <= 0;
+ end
+ begin
+ wvalid <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ wdata <= i;
+ @(posedge clk iff wready);
+ end
+ wvalid <= 0;
+ end
+ join
+
+ // Read Last Entry for Sync
+ arvalid <= 1;
+ araddr <= { DEPTH-1, 2'b00 };
+ @(posedge clk iff arready);
+ arvalid <= 0;
+ araddr <= 'x;
+
+ rready <= 1;
+ @(posedge clk iff rvalid);
+ rready <= 0;
+ assert(!rresp && (rdata == DEPTH-1)) else begin
+ $error("Read back error.");
+ $stop;
+ end
+
+ // Reset Output Pipeline
+ rst <= 1;
+ @(posedge clk);
+ rst <= 0;
+
+ // One Round of Unimpeded Stream Read
+ ordy <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ @(posedge clk iff ovld);
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;
+ end
+ end
+ ordy <= 0;
+
+ // Another Round with Intermittent Backpressure
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ while($urandom()%13 == 0) @(posedge clk);
+ ordy <= 1;
+ @(posedge clk iff ovld);
+ ordy <= 0;
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;
+ end
+ end
+
+ // Yet Another Round Adding Intermittent Readbacks
+ fork
+ automatic bit done = 0;
+
+ begin
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ while($urandom()%13 == 0) @(posedge clk);
+ ordy <= 1;
+ @(posedge clk iff ovld);
+ ordy <= 0;
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;
+ end
+ end
+ done = 1;
+ end
+ begin
+ while(!done) begin
+ automatic int av = $urandom() % DEPTH;
+ repeat($urandom()%19) @(posedge clk);
+ arvalid <= 1;
+ araddr <= { av, 2'b00 };
+ @(posedge clk iff arready);
+ arvalid <= 0;
+ araddr <= 'x;
+
+ rready <= 1;
+ @(posedge clk iff rvalid);
+ rready <= 0;
+ assert(!rresp && (rdata == av)) else begin
+ $error("Read back error.");
+ $stop;
+ end
+ end
+ end
+ join
+
+ repeat(2) @(posedge clk);
+ $display("Test completed.");
+ $finish;
+ end
+
+endmodule : memstream_axi_tb
diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
index e802d81c79..d2bffc9f1c 100644
--- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
+++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
@@ -12,6 +12,9 @@ proc init_gui { IPINST } {
ipgui::add_param $IPINST -name "INIT_FILE" -parent ${Page_0}
ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0}
ipgui::add_param $IPINST -name "WIDTH" -parent ${Page_0}
+
+ ipgui::add_param $IPINST -name "PUMPED_MEMORY"
+
}
proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.DEPTH PARAM_VALUE.WIDTH } {
@@ -48,6 +51,15 @@ proc validate_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } {
return true
}
+proc update_PARAM_VALUE.PUMPED_MEMORY { PARAM_VALUE.PUMPED_MEMORY } {
+ # Procedure called to update PUMPED_MEMORY when any of the dependent parameters in the arguments change
+}
+
+proc validate_PARAM_VALUE.PUMPED_MEMORY { PARAM_VALUE.PUMPED_MEMORY } {
+ # Procedure called to validate PUMPED_MEMORY
+ return true
+}
+
proc update_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } {
# Procedure called to update RAM_STYLE when any of the dependent parameters in the arguments change
}
@@ -87,6 +99,11 @@ proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.
set_property value [get_property value ${PARAM_VALUE.RAM_STYLE}] ${MODELPARAM_VALUE.RAM_STYLE}
}
+proc update_MODELPARAM_VALUE.PUMPED_MEMORY { MODELPARAM_VALUE.PUMPED_MEMORY PARAM_VALUE.PUMPED_MEMORY } {
+ # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
+ set_property value [get_property value ${PARAM_VALUE.PUMPED_MEMORY}] ${MODELPARAM_VALUE.PUMPED_MEMORY}
+}
+
proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.AXILITE_ADDR_WIDTH } {
# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH}
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0ee84b2f79..b2f2e582b2 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -58,7 +58,7 @@ module mvu_vvu_axi #(
bit NARROW_WEIGHTS = 0,
bit SIGNED_ACTIVATIONS = 0,
- bit PUMPED_COMPUTE = 0,
+	bit PUMPED_COMPUTE = 0, // requires even SIMD (SIMD % 2 == 0)
bit FORCE_BEHAVIORAL = 0,
bit M_REG_LUT = 1,
@@ -218,12 +218,10 @@ module mvu_vvu_axi #(
// Identify second fast cycle just before active slow clock edge
logic Active = 0;
- if(1) begin : blkActive
- uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net
- (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk));
- (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0]));
- always_ff @(posedge clk2x) Active <= clk_lut[1];
- end : blkActive
+ always_ff @(posedge clk2x) begin
+ if(rst) Active <= 0;
+ else Active <= !Active;
+ end
// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
// - Both fast cycles are controlled by the same enable state.
@@ -300,6 +298,20 @@ module mvu_vvu_axi #(
case(COMPUTE_CORE)
"mvu_vvu_8sx9_dsp58":
+ if(PUMPED_COMPUTE) begin
+ mvu_vvu_8sx9_dsp58 #(
+ .IS_MVU(IS_MVU),
+ .PE(PE), .SIMD(DSP_SIMD),
+ .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+ .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+ .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+ ) core (
+ .clk(clk2x), .rst, .en(dsp_en),
+ .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+ .vld(dsp_vld), .p(dsp_p)
+ );
+ end
+ else begin
mvu_vvu_8sx9_dsp58 #(
.IS_MVU(IS_MVU),
.PE(PE), .SIMD(DSP_SIMD),
@@ -307,10 +319,11 @@ module mvu_vvu_axi #(
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
- .clk(dsp_clk), .rst, .en(dsp_en),
+ .clk(clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
+ end
"mvu_4sx4u_dsp48e1":
mvu_4sx4u #(
.PE(PE), .SIMD(DSP_SIMD),
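The DONT_TOUCH LUT-delay chain that previously sampled the slow clock from the fast domain is replaced by a plain toggle register, the same scheme memstream_axi.sv uses above: once rst deasserts, Active alternates on every clk2x edge, distinguishing the two fast cycles inside each slow cycle. A small Python model of the toggle (a sketch, assuming reset releases synchronously in both domains):

    def active_sequence(n_fast_edges: int, rst_cycles: int = 2):
        active, out = 0, []
        for i in range(n_fast_edges):
            active = 0 if i < rst_cycles else 1 - active  # Active <= rst ? 0 : !Active
            out.append(active)
        return out

    print(active_sequence(8))  # [0, 0, 1, 0, 1, 0, 1, 0]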
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
index 4edf676008..cb3a0d4779 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -34,7 +34,7 @@
module $MODULE_NAME_AXI_WRAPPER$ #(
parameter IS_MVU = $IS_MVU$,
parameter COMPUTE_CORE = "$COMPUTE_CORE$",
- parameter PUMPED_COMPUTE = 0,
+ parameter PUMPED_COMPUTE = $PUMPED_COMPUTE$,
parameter MW = $MW$,
parameter MH = $MH$,
parameter PE = $PE$,
@@ -56,9 +56,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
input ap_clk,
- // (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *)
- // (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
- // input ap_clk2x,
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+ input ap_clk2x,
(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
input ap_rst_n,
@@ -82,7 +82,7 @@ mvu_vvu_axi #(
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) inst (
.ap_clk(ap_clk),
- .ap_clk2x(1'b0), // wired to ground since double-pumped compute not enabled through FINN for now
+ .ap_clk2x(ap_clk2x),
.ap_rst_n(ap_rst_n),
.s_axis_weights_tdata(weights_V_TDATA),
.s_axis_weights_tvalid(weights_V_TVALID),
diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v
index 22dc6bd8cd..bb657a7478 100644
--- a/finn-rtllib/swg/swg_template_wrapper.v
+++ b/finn-rtllib/swg/swg_template_wrapper.v
@@ -71,4 +71,8 @@ $TOP_MODULE_NAME$_impl #(
.out_V_V_TREADY(out_V_TREADY)
);
+if (OUT_WIDTH_PADDED > BUF_OUT_WIDTH) begin
+ assign out_V_TDATA[OUT_WIDTH_PADDED-1:BUF_OUT_WIDTH] = {(OUT_WIDTH_PADDED-BUF_OUT_WIDTH){1'b0}};
+end
+
endmodule : $TOP_MODULE_NAME$
diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
index 158f3132e3..7e49d3eafb 100644
--- a/finn-rtllib/swg/swg_template_wrapper_dynamic.v
+++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
@@ -180,4 +180,8 @@ $TOP_MODULE_NAME$_impl #(
.cfg_last_write(cfg_last_write)
);
+if (OUT_WIDTH_PADDED > BUF_OUT_WIDTH) begin
+ assign out_V_TDATA[OUT_WIDTH_PADDED-1:BUF_OUT_WIDTH] = {(OUT_WIDTH_PADDED-BUF_OUT_WIDTH){1'b0}};
+end
+
endmodule : $TOP_MODULE_NAME$
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
index 39756e5c2b..04c13424c9 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -191,7 +191,10 @@ module thresholding_axi #(
.cfg_rack, .cfg_q,
.irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat,
- .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata)
+ .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata[PE*O_BITS-1:0])
);
+ if($bits(m_axis_tdata) > PE*O_BITS) begin : genPadOut
+ assign m_axis_tdata[$left(m_axis_tdata):PE*O_BITS] = '0;
+ end : genPadOut
endmodule : thresholding_axi
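The SWG wrappers above and thresholding_axi here apply the same fix: TDATA is byte-padded to ((WIDTH+7)/8)*8 bits, and the pad bits beyond the payload are now tied to zero instead of left undriven. A one-line sketch of the padded width:

    def padded_width(bits: int) -> int:
        # AXI-Stream TDATA is byte-aligned: ((WIDTH+7)/8)*8 in the wrappers
        return ((bits + 7) // 8) * 8

    print(padded_width(20))  # -> 24; bits [23:20] are now driven to zero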
diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v
index 49a1f2bd8b..28d0238c50 100644
--- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v
+++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v
@@ -25,7 +25,7 @@
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author Thomas B. Preußer
@@ -40,7 +40,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
parameter PE = $PE$, // Processing Parallelism, requires C = k*PE
parameter SIGNED = $SIGNED$, // signed inputs
- parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa
+ parameter FPARG = $FPARG$, // floating-point inputs: [sign] | exponent | mantissa
parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data
diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv
index cfd875f5c4..1a2b8402a0 100644
--- a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv
+++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv
@@ -232,7 +232,7 @@ module thresholding_axi_tb #(
end
join_any
done <= 1;
- repeat(N+6) @(posedge clk);
+ repeat(2*N+8) @(posedge clk);
assert(QW.size() == 0) else begin
$error("Missing %0d outputs.", QW.size());
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
index aacd12ef05..e914781b21 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
@@ -404,6 +404,7 @@
"child_model = child_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))\n",
"child_model = child_model.transform(PrepareRTLSim())\n",
"child_model.set_metadata_prop(\"exec_mode\",\"rtlsim\")\n",
+ "child_model.set_metadata_prop(\"rtlsim_backend\",\"pyxsi\")\n",
"child_model.save(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\");"
]
},
diff --git a/python_repos.txt b/python_repos.txt
new file mode 100644
index 0000000000..c330aa6967
--- /dev/null
+++ b/python_repos.txt
@@ -0,0 +1,5 @@
+dir,url,commit_hash
+qonnx,https://github.com/fastmachinelearning/qonnx.git,ca91dbe24e8d0122ba981070b918be31fb60750e
+finn-experimental,https://github.com/Xilinx/finn-experimental.git,0724be21111a21f0d81a072fccc1c446e053f851
+brevitas,https://github.com/Xilinx/brevitas.git,0ea7bac8f7d7b687c1ac0c8cb4712ad9885645c5
+pyverilator,https://github.com/maltanar/pyverilator.git,ce0a08c20cb8c1d1e84181d6f392390f846adbd1
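The first row of python_repos.txt is a header; fetch-repos.sh consumes every data row (tail -n +2), while the entrypoint also skips the qonnx row (tail -n +3) because qonnx is installed separately with the pyproject.toml workaround. A Python sketch of the same parsing:

    import csv

    with open("python_repos.txt", newline="") as f:
        rows = list(csv.DictReader(f))  # header: dir,url,commit_hash

    for r in rows:  # fetch-repos.sh: fetch_repo "${arr[1]}" "${arr[2]}" "${arr[0]}"
        print("fetch", r["url"], "@", r["commit_hash"], "into deps/" + r["dir"])

    for r in rows[1:]:  # entrypoint skips the first data row (qonnx)
        print("pip install --user -e deps/" + r["dir"])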
diff --git a/requirements.txt b/requirements.txt
index 1683695576..a0791b5a88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-bitstring==3.1.7
+bitstring==4.2.3
clize==5.0.1
dataclasses-json==0.5.7
gspread==3.6.0
@@ -8,6 +8,7 @@ numpy==1.24.1
onnx==1.17.0
onnxoptimizer
onnxruntime==1.18.1
+onnxsim==0.4.36
pre-commit==3.3.2
protobuf==3.20.3
psutil==5.9.4
@@ -16,5 +17,6 @@ scipy==1.10.1
setupext-janitor>=1.1.2
sigtools==4.0.1
toposort==1.7.0
+transformers==4.46.3
vcdvcd==1.0.5
wget==3.2
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ab2280554c..bddf4395ca 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -109,6 +109,7 @@
InsertAndSetFIFODepths,
RemoveShallowFIFOs,
SplitLargeFIFOs,
+ xsi_fifosim,
)
from finn.transformation.fpgadataflow.set_folding import SetFolding
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
@@ -126,7 +127,6 @@
get_rtlsim_trace_depth,
pyverilate_get_liveness_threshold_cycles,
)
-from finn.util.pyverilator import verilator_fifosim
from finn.util.test import execute_parent
@@ -250,6 +250,8 @@ def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
# set top-level prop for stitched-ip rtlsim and launch
verify_model.set_metadata_prop("exec_mode", "rtlsim")
# TODO make configurable
+ verify_model.set_metadata_prop("rtlsim_backend", "pyxsi")
+ # TODO make configurable
# verify_model.set_metadata_prop("rtlsim_trace", "trace.vcd")
return verify_model
@@ -719,7 +721,7 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"]
else:
- rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
+ rtlsim_perf_dict = xsi_fifosim(model, rtlsim_bs)
# keep keys consistent between the Python and C++-styles
cycles = rtlsim_perf_dict["cycles"]
clk_ns = float(model.get_metadata_prop("clk_ns"))
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 588e97e9e4..7c0d69e17a 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -52,44 +52,38 @@ def execute_onnx(model, input_dict, return_full_exec_context=False, start_node=N
model_exec_mode = model.get_metadata_prop("exec_mode")
if (model_exec_mode is None) or (model_exec_mode == ""):
return execute_onnx_base(model, input_dict, return_full_exec_context, start_node, end_node)
+ elif model_exec_mode == "rtlsim":
+ # check sanity of model and then use stitched IP for rtlsim
+ if not model.check_all_tensor_shapes_specified():
+ raise Exception("Found unspecified tensor shapes, try infer_shapes")
+ ret = model.analysis(ta.nodes_topologically_sorted)
+ assert (
+ ret["nodes_topologically_sorted"] is True
+ ), """Nodes must be
+ topologically sorted."""
- if not model.check_all_tensor_shapes_specified():
- raise Exception("Found unspecified tensor shapes, try infer_shapes")
- ret = model.analysis(ta.nodes_topologically_sorted)
- assert (
- ret["nodes_topologically_sorted"] is True
- ), """Nodes must be
- topologically sorted."""
-
- graph = model.graph
- # first, we need to make sure that every variable required by the graph has
- # some buffer associated with it. this includes graph inputs (which includes
- # the input data as well as the trained parameters) and the graph ValueInfo
- # (intermediate tensors between layers)
- # this is provided by the execution_context, which is a dict of np.ndarray
- execution_context = model.make_empty_exec_context()
- # fill in any inputs provided to this function
- for inp_name in input_dict.keys():
- if inp_name in execution_context:
- if execution_context[inp_name].shape == input_dict[inp_name].shape:
- execution_context[inp_name] = input_dict[inp_name]
- else:
- raise Exception(
- "Shape mismatch for provided input %s: found %s expected %s "
- % (
- inp_name,
- str(execution_context[inp_name].shape),
- str(input_dict[inp_name].shape),
+ graph = model.graph
+ # first, we need to make sure that every variable required by the graph has
+ # some buffer associated with it. this includes graph inputs (which includes
+ # the input data as well as the trained parameters) and the graph ValueInfo
+ # (intermediate tensors between layers)
+ # this is provided by the execution_context, which is a dict of np.ndarray
+ execution_context = model.make_empty_exec_context()
+ # fill in any inputs provided to this function
+ for inp_name in input_dict.keys():
+ if inp_name in execution_context:
+ if execution_context[inp_name].shape == input_dict[inp_name].shape:
+ execution_context[inp_name] = input_dict[inp_name]
+ else:
+ raise Exception(
+ "Shape mismatch for provided input %s: found %s expected %s "
+ % (
+ inp_name,
+ str(execution_context[inp_name].shape),
+ str(input_dict[inp_name].shape),
+ )
)
- )
- # check if model has an execution mode set
- # if None, execute model node by node using execute_node()
- # if set to "rtlsim" execute model using pyverilator
- model_exec_mode = model.get_metadata_prop("exec_mode")
- if (model_exec_mode is None) or (model_exec_mode == ""):
- return execute_onnx_base()
- elif model_exec_mode == "rtlsim":
# use stitched IP for rtlsim
rtlsim_exec(model, execution_context)
else:
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 0bac40f503..71230d8eb8 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -26,11 +26,18 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import numpy as np
import os
from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io
from qonnx.custom_op.registry import getCustomOp
-from finn.util.basic import pyverilate_get_liveness_threshold_cycles
+from finn.util.basic import (
+ get_finn_root,
+ get_vivado_root,
+ launch_process_helper,
+ make_build_dir,
+ pyverilate_get_liveness_threshold_cycles,
+)
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
from finn.util.pyverilator import pyverilate_stitched_ip
@@ -39,35 +46,13 @@
except ModuleNotFoundError:
PyVerilator = None
+try:
+ import pyxsi_utils
+except ModuleNotFoundError:
+ pyxsi_utils = None
-def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
- """Use PyVerilator to execute given model with stitched IP. The execution
- context contains the input values. Hook functions can be optionally
- specified to observe/alter the state of the circuit, receiving the
- PyVerilator sim object as their first argument:
- - pre_hook : hook function to be called before sim start (after reset)
- - post_hook : hook function to be called after sim end
- """
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
- # ensure stitched ip project already exists
- assert os.path.isfile(
- model.get_metadata_prop("wrapper_filename")
- ), """The
- file name from metadata property "wrapper_filename" doesn't exist."""
- assert os.path.isdir(
- model.get_metadata_prop("vivado_stitch_proj")
- ), """The
- directory from metadata property "vivado_stitch_proj" doesn't exist"""
- trace_file = model.get_metadata_prop("rtlsim_trace")
- if trace_file is None:
- trace_file = ""
- extra_verilator_args = model.get_metadata_prop("extra_verilator_args")
- if extra_verilator_args is None:
- extra_verilator_args = []
- else:
- extra_verilator_args = eval(extra_verilator_args)
+def prep_rtlsim_io_dict(model, execution_context):
# extract i/o info to prepare io_dict
io_dict = {"inputs": {}, "outputs": {}}
if_dict = eval(model.get_metadata_prop("vivado_stitch_ifnames"))
@@ -125,6 +110,286 @@ def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
o_stream_w = last_node.get_outstream_width()
o_tensor_info.append((o_stream_w, o_dt, o_folded_shape, o_shape))
num_out_values += batchsize * last_node.get_number_output_values()
+ return io_dict, if_dict, num_out_values, o_tensor_info
+
+
+def file_to_basename(x):
+ return os.path.basename(os.path.realpath(x))
+
+
+def rtlsim_exec_cppxsi(model, execution_context, dummy_data_mode=False, postproc_cpp=""):
+ """Use XSI C++ rtl simulation to execute given model with stitched IP.
+ The dummy_data_mode flag controls whether the simulation is driven by
+ dummy data or real data. The execution_context parameter must be formatted
+ according to whether dummy or real data is used.
+    Example with dummy_data_mode = True:
+    execution_context = {
+        "inputs" : {"<stream name>" : <number of transactions>},
+        "outputs" : {"<stream name>" : <number of transactions>},
+    }
+    Example with dummy_data_mode = False:
+    execution_context = {
+        "<tensor name>" : <input data array>
+    }
+
+    The postproc_cpp optional argument can be used to inject C++ code to
+    retrieve extra data when the simulation is finished. See the
+    @POSTPROC_CPP@ template argument in xsi_simdriver.cpp for the context and
+    functions available.
+
+ """
+ # TODO: support running functional rtlsim with real I/O data
+ # TODO: support running with multiple inputs/outputs
+ # TODO: rename utility fxn to remove "pyverilate", used for other backends too
+ timeout_cycles = pyverilate_get_liveness_threshold_cycles()
+
+ assert dummy_data_mode, "Only dummy_data_mode=True is supported for now"
+
+ # ensure stitched ip project already exists
+ assert os.path.isfile(
+ model.get_metadata_prop("wrapper_filename")
+ ), """The
+ file name from metadata property "wrapper_filename" doesn't exist."""
+ assert os.path.isdir(
+ model.get_metadata_prop("vivado_stitch_proj")
+ ), """The
+ directory from metadata property "vivado_stitch_proj" doesn't exist"""
+ trace_file = model.get_metadata_prop("rtlsim_trace")
+ if not dummy_data_mode:
+ io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict(
+ model, execution_context
+ )
+
+ # prepare rtlsim compiled object (unless it already exists)
+ rtlsim_so = model.get_metadata_prop("rtlsim_so")
+ top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
+    top_module_name = os.path.splitext(top_module_file_name)[0]
+ if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
+ vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
+ with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
+ all_verilog_srcs = f.read().split()
+ single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_")
+
+ rtlsim_so = pyxsi_utils.compile_sim_obj(top_module_name, all_verilog_srcs, single_src_dir)
+ # save generated lib filename in attribute
+ model.set_metadata_prop("rtlsim_so", rtlsim_so[0] + "/" + rtlsim_so[1])
+ sim_base, sim_rel = rtlsim_so
+ # pass in correct tracefile from attribute
+ if trace_file == "default":
+ trace_file = top_module_file_name + ".wdb"
+ else:
+ sim_base, sim_rel = rtlsim_so.split("xsim.dir")
+ sim_rel = "xsim.dir" + sim_rel
+ # prepare the C++ sim driver template
+ fifosim_cpp_fname = get_finn_root() + "/src/finn/qnn-data/cpp/xsi_simdriver.cpp"
+ with open(fifosim_cpp_fname, "r") as f:
+ fifosim_cpp_template = f.read()
+
+ instream_iters = []
+ outstream_iters = []
+ for top_inp in model.graph.input:
+ iname = top_inp.name
+ first_node = model.find_consumer(iname)
+ assert first_node is not None, "Failed to find consumer for " + iname
+ fnode_inst = getCustomOp(first_node)
+ top_ind = list(first_node.input).index(iname)
+ ishape_folded = fnode_inst.get_folded_input_shape(ind=top_ind)
+ instream_iters.append(np.prod(ishape_folded[:-1]))
+ for top_out in model.graph.output:
+ oname = top_out.name
+ last_node = model.find_producer(oname)
+ assert last_node is not None, "Failed to find producer for " + oname
+ lnode_inst = getCustomOp(last_node)
+ top_ind = list(last_node.output).index(oname)
+ oshape_folded = lnode_inst.get_folded_output_shape(ind=top_ind)
+ outstream_iters.append(np.prod(oshape_folded[:-1]))
+
+    # retrieve the number of inferences from the execution context
+    n_inferences = execution_context[model.graph.input[0].name]
+    # determine whether the design is double-pumped from the presence of clk2x
+ ifnames = model.get_metadata_prop("vivado_stitch_ifnames")
+    assert (
+        ifnames is not None
+    ), "Couldn't find stitched-IP interface names, did you run IP stitching first?"
+ ifnames = eval(ifnames)
+ if "clk2x" in ifnames.keys():
+ is_double_pumped = ifnames["clk2x"] != []
+ else:
+ is_double_pumped = False
+ clknames = "clk_and_clk2x" if is_double_pumped else "clk"
+ instream_names = [x[0] for x in ifnames["s_axis"]]
+ instream_names_str = "{" + ", ".join(['"' + x + '"' for x in instream_names]) + "}"
+ outstream_names = [x[0] for x in ifnames["m_axis"]]
+ outstream_names_str = "{" + ", ".join(['"' + x + '"' for x in outstream_names]) + "}"
+ instream_iters_str = "{" + ", ".join([str(x) for x in instream_iters]) + "}"
+ outstream_iters_str = "{" + ", ".join([str(x) for x in outstream_iters]) + "}"
+ # fill in the template arguments for sim driver
+ template_dict = {
+ # number of input transactions per inference
+ "ITERS_PER_INPUT": instream_iters_str,
+ # number of output transactions per inference
+ "ITERS_PER_OUTPUT": outstream_iters_str,
+ # number of inferences
+ "N_INFERENCES": n_inferences,
+ # max number of cycles to wait for output activity before timeout
+ "MAX_ITERS": timeout_cycles,
+ # name of the top-level HDL module
+ "TOP_MODULE_NAME": top_module_name,
+ # names of the top-level AXI streams and signals
+ "INSTREAM_NAME": instream_names_str,
+ "OUTSTREAM_NAME": outstream_names_str,
+ "CLK_NAME": "ap_clk",
+ "CLK2X_NAME": "ap_clk2x",
+ "CLKNAMES": clknames,
+ "NRST_NAME": "ap_rst_n",
+ # control tracing and trace filename
+ "TRACE_FILE": "NULL" if trace_file is None else f'"{trace_file}"',
+ "TRACE_CMD": "" if trace_file is None else "top->trace_all();",
+ # code to post-process final sim status to extract more data
+ "POSTPROC_CPP": postproc_cpp,
+ # sim kernel .so to use (depends on Vivado version)
+ "SIMKERNEL_SO": pyxsi_utils.get_simkernel_so(),
+ }
+ for key, val in template_dict.items():
+ fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val))
+ with open(sim_base + "/rtlsim_xsi.cpp", "w") as f:
+ f.write(fifosim_cpp_template)
+
+ vivado_incl_dir = get_vivado_root() + "/data/xsim/include"
+ xsi_include_dir = get_finn_root() + "/deps/pyxsi/src"
+ # launch g++ to compile the rtlsim executable
+ build_cmd = [
+ "g++",
+ f"-I{xsi_include_dir}",
+ f"-I{vivado_incl_dir}",
+ "-std=c++14",
+ "-O3",
+ "-o",
+ "rtlsim_xsi",
+ "rtlsim_xsi.cpp",
+ f"{xsi_include_dir}/xsi_loader.cpp",
+ "-ldl",
+ "-lrt",
+ ]
+ # write compilation command to a file for easy re-running/debugging
+ with open(sim_base + "/compile_rtlsim.sh", "w") as f:
+ f.write(" ".join(build_cmd))
+ launch_process_helper(build_cmd, cwd=sim_base)
+ assert os.path.isfile(sim_base + "/rtlsim_xsi"), "Failed to compile rtlsim executable"
+
+ # launch the rtlsim executable
+ # important to specify LD_LIBRARY_PATH here for XSI to work correctly
+ runsim_env = os.environ.copy()
+ runsim_env["LD_LIBRARY_PATH"] = get_vivado_root() + "/lib/lnx64.o"
+ runsim_cmd = ["./rtlsim_xsi"]
+ with open(sim_base + "/run_rtlsim.sh", "w") as f:
+ f.write(f"LD_LIBRARY_PATH={runsim_env['LD_LIBRARY_PATH']} ./rtlsim_xsi")
+ launch_process_helper(runsim_cmd, proc_env=runsim_env, cwd=sim_base)
+
+ # parse results file and return dict
+ with open(sim_base + "/results.txt", "r") as f:
+ results = f.read().strip().split("\n")
+ ret_dict = {}
+ for result_line in results:
+ key, val = result_line.split("\t")
+ ret_dict[key] = int(val)
+ return ret_dict
+
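+# Usage sketch (illustrative only; the tensor name is hypothetical): in dummy
+# data mode, the value in the execution context is the number of inferences to
+# drive, and the parsed results.txt is returned as a dict whose keys are
+# produced by the xsi_simdriver.cpp template and any injected postproc_cpp:
+#
+#   res = rtlsim_exec_cppxsi(model, {"global_in": 100}, dummy_data_mode=True)
+#   print(res)
+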
+
+def rtlsim_exec_pyxsi(model, execution_context, pre_hook=None, post_hook=None):
+ """Use PyXSI to execute given model with stitched IP. The execution
+ context contains the input values. Hook functions can be optionally
+ specified to observe/alter the state of the circuit, receiving the
+ PyXSI RPC sim handle as their first argument:
+ - pre_hook : hook function to be called before sim start (after reset)
+ - post_hook : hook function to be called after sim end
+ """
+ # ensure stitched ip project already exists
+ assert os.path.isfile(
+ model.get_metadata_prop("wrapper_filename")
+ ), """The
+ file name from metadata property "wrapper_filename" doesn't exist."""
+ assert os.path.isdir(
+ model.get_metadata_prop("vivado_stitch_proj")
+ ), """The
+ directory from metadata property "vivado_stitch_proj" doesn't exist"""
+ trace_file = model.get_metadata_prop("rtlsim_trace")
+ io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict(model, execution_context)
+
+ # prepare rtlsim model
+ rtlsim_so = model.get_metadata_prop("rtlsim_so")
+ if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):
+ vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
+ with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
+ all_verilog_srcs = f.read().split()
+ top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename"))
+        top_module_name = os.path.splitext(top_module_file_name)[0]
+ single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_")
+
+ rtlsim_so = pyxsi_utils.compile_sim_obj(top_module_name, all_verilog_srcs, single_src_dir)
+ # save generated lib filename in attribute
+ model.set_metadata_prop("rtlsim_so", rtlsim_so[0] + "/" + rtlsim_so[1])
+ sim_base, sim_rel = rtlsim_so
+ # pass in correct tracefile from attribute
+ if trace_file == "default":
+ trace_file = top_module_file_name + ".wdb"
+ sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, trace_file)
+ else:
+ sim_base, sim_rel = rtlsim_so.split("xsim.dir")
+ sim_rel = "xsim.dir" + sim_rel
+ sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, trace_file)
+
+ # reset and call rtlsim, including any pre/post hooks
+ pyxsi_utils.reset_rtlsim(sim)
+ if pre_hook is not None:
+ pre_hook(sim)
+ n_cycles = pyxsi_utils.rtlsim_multi_io(
+ sim,
+ io_dict,
+ num_out_values,
+ sname="_",
+ liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+ )
+ if post_hook is not None:
+ post_hook(sim)
+ # important to call close_rtlsim for pyxsi to flush traces and stop
+ # the RPC server process
+ pyxsi_utils.close_rtlsim(sim)
+
+ # unpack outputs and put back into execution context
+ for o, o_vi in enumerate(model.graph.output):
+ o_name = o_vi.name
+ if_name = if_dict["m_axis"][o][0]
+ o_stream_w, o_dt, o_folded_shape, o_shape = o_tensor_info[o]
+ packed_output = io_dict["outputs"][if_name]
+ o_folded_tensor = rtlsim_output_to_npy(
+ packed_output, None, o_dt, o_folded_shape, o_stream_w, o_dt.bitwidth()
+ )
+ execution_context[o_name] = o_folded_tensor.reshape(o_shape)
+
+ model.set_metadata_prop("cycles_rtlsim", str(n_cycles))
+
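+# Usage sketch (illustrative only): hook functions receive the PyXSI sim
+# handle, e.g.
+#   rtlsim_exec_pyxsi(model, ctx, pre_hook=lambda sim: print("sim ready"))
+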
+
+def rtlsim_exec_pyverilator(model, execution_context, pre_hook=None, post_hook=None):
+ if PyVerilator is None:
+ raise ImportError("Installation of PyVerilator is required.")
+ # ensure stitched ip project already exists
+ assert os.path.isfile(
+ model.get_metadata_prop("wrapper_filename")
+ ), """The
+ file name from metadata property "wrapper_filename" doesn't exist."""
+ assert os.path.isdir(
+ model.get_metadata_prop("vivado_stitch_proj")
+ ), """The
+ directory from metadata property "vivado_stitch_proj" doesn't exist"""
+ trace_file = model.get_metadata_prop("rtlsim_trace")
+ if trace_file is None:
+ trace_file = ""
+ extra_verilator_args = model.get_metadata_prop("extra_verilator_args")
+ if extra_verilator_args is None:
+ extra_verilator_args = []
+ else:
+ extra_verilator_args = eval(extra_verilator_args)
+ io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict(model, execution_context)
# prepare pyverilator model
rtlsim_so = model.get_metadata_prop("rtlsim_so")
@@ -161,3 +426,21 @@ def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
execution_context[o_name] = o_folded_tensor.reshape(o_shape)
model.set_metadata_prop("cycles_rtlsim", str(n_cycles))
+
+
+def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
+ """Use PyVerilator or PyXSI to execute given model with stitched IP, depending
+ on the rtlsim_backend metadata_prop on the model. The execution
+ context contains the input values. Hook functions can be optionally
+ specified to observe/alter the state of the circuit, receiving the
+    backend-specific sim object (PyVerilator or PyXSI handle) as their first argument:
+ - pre_hook : hook function to be called before sim start (after reset)
+ - post_hook : hook function to be called after sim end
+ """
+ backend = model.get_metadata_prop("rtlsim_backend")
+ if backend == "pyverilator":
+ rtlsim_exec_pyverilator(model, execution_context, pre_hook, post_hook)
+ elif backend == "pyxsi":
+ rtlsim_exec_pyxsi(model, execution_context, pre_hook, post_hook)
+ else:
+ assert False, f"Unrecognized rtlsim_backend value: {backend}"
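+
+
+# Note: the backend is selected beforehand by setting the corresponding
+# metadata property, e.g. model.set_metadata_prop("rtlsim_backend", "pyxsi")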
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..4f2f69445e 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -27,6 +27,33 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# The base class of all generic custom operations before specializing to either
+# HLS or RTL backend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# Dictionary of HWCustomOp implementations
+custom_op = dict()
+
+
+# Registers a class into the custom_op dictionary
+# Note: This must be defined first, before importing any custom op
+# implementation to avoid "importing partially initialized module" issues.
+def register_custom_op(cls):
+ # The class must actually implement HWCustomOp
+ assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}"
+ # Insert the class into the custom_op dictionary by its name
+ custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue?
+ # Pass through the class unmodified
+ return cls
+
+
+# flake8: noqa
+# Disable linting from here, as all imports will be flagged E402 and maybe F401
+
+
+# Import the submodule containing specializations of ElementwiseBinaryOperation
+# Note: This will automatically register all decorated classes into this domain
+import finn.custom_op.fpgadataflow.elementwise_binary
from finn.custom_op.fpgadataflow.addstreams import AddStreams
from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
from finn.custom_op.fpgadataflow.concat import StreamingConcat
@@ -55,8 +82,6 @@
from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
-custom_op = dict()
-
# make sure new HLSCustomOp subclasses are imported here so that they get
# registered and plug in correctly into the infrastructure
custom_op["MVAU"] = MVAU
diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py
new file mode 100644
index 0000000000..93078aab91
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py
@@ -0,0 +1,974 @@
+# fmt: off
+# Disable formatter. This is deliberately formatted to stay within 80 characters
+# per line. Black, however, formats some lines going beyond this.
+
+# Numpy math and arrays
+import numpy as np
+
+# Operating system stuff, e.g. paths
+import os
+
+# Python warning subsystem
+import warnings
+from functools import partial
+
+# Helper for creating ONNX nodes
+from onnx import helper as oh
+
+# QONNX/FINN datatypes
+from qonnx.core.datatype import DataType
+
+# QONNX wrapper to ONNX model graphs
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.general.quant import max_int, min_int
+
+# Utility for registering HWCustomOp implementations into the module scope
+from finn.custom_op.fpgadataflow import register_custom_op
+
+# Derive custom operators from the FINN base custom op
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# Converts inputs/outputs to/from RTL simulation format
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+# Generic implementation for elementwise binary operations
+class ElementwiseBinaryOperation(HWCustomOp):
+ # Specifies the elementwise operation to be implemented
+ # Format: (Identifier, Python, C++, RTL)
+ _operation: tuple[str, np.ufunc, str, str] | None = None
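+    # e.g. for addition: ("Add", np.add, "({0} + {1})", None)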
+
+ # Numpy operation available as property
+ @property
+ def npy_op(self) -> np.ufunc:
+ return self._operation[1]
+
+ # C++ operation template available as property
+ @property
+ def cpp_op(self) -> str:
+ return self._operation[2]
+
+ # RTL operation template available as property
+ @property
+ def rtl_op(self) -> str:
+ return self._operation[3]
+
+ # Initializes the operator given an onnx graph node
+ def __init__(self, onnx_node, **kwargs):
+ # Just forward all arguments to the init method of the CustomOp base
+ super().__init__(onnx_node, **kwargs)
+
+ # Defines attributes which must be present on this node
+ def get_nodeattr_types(self):
+ # Start from parent operator class attributes
+ attrs = HWCustomOp.get_nodeattr_types(self)
+ # Update attributes dictionary for new custom operator
+ attrs.update({
+ # Data type of the left-hand-side input elements
+ "lhs_dtype": ("s", True, ""),
+ # Data type of the right-hand-side input elements
+ "rhs_dtype": ("s", True, ""),
+ # Data type of the output elements
+ "out_dtype": ("s", True, ""),
+ # Shape of the left-hand-side input
+ "lhs_shape": ("ints", True, [1]),
+ # Shape of the right-hand-side input
+ "rhs_shape": ("ints", True, [1]),
+            # Shape of the output, must correspond to multi-directional
+ # broadcasting of the left- and right-hand-side
+ "out_shape": ("ints", True, [1]),
+ # Style specifies how the left-hand-side input is provided
+ # Note: Might be inferred from the context
+ "lhs_style": ("s", False, "input", {"input", "const"}),
+ # Style specifies how the right-hand-side input is provided
+ # Note: Might be inferred from the context
+ "rhs_style": ("s", False, "input", {"input", "const"}),
+ # Number of elements in the last dimensions processed in parallel
+ "PE": ("i", False, 1),
+ # Possible execution modes for simulating this node
+ # Note: Override to support python mode
+ "exec_mode": (
+ "s", False, "python", {"", "rtlsim", "cppsim", "python"}
+ ),
+ # FPGA resource type for memories/internal buffers of the operator
+ "ram_style": (
+ "s", False, "auto", {"auto", "block", "distributed", "ultra"}
+ ),
+ # Input and output FIFO depths for multi-I/O nodes
+ # Note: Need to override here as there might be two inputs
+ "inFIFODepths": ("ints", False, [2, 2]),
+ "outFIFODepths": ("ints", False, [2]),
+ })
+ # Return updated attribute dictionary
+ return attrs
+
+ # Datatype attribute as property for convenience
+ @property
+ def lhs_dtype(self):
+ # Note: Converts from string to QONNX data type
+ return DataType[self.get_nodeattr("lhs_dtype")]
+
+ # Datatype attribute as property for convenience
+ @property
+ def rhs_dtype(self):
+ # Note: Converts from string to QONNX data type
+ return DataType[self.get_nodeattr("rhs_dtype")]
+
+ # Datatype attribute as property for convenience
+ @property
+ def out_dtype(self):
+ # Note: Converts from string to QONNX data type
+ return DataType[self.get_nodeattr("out_dtype")]
+
+ # Shape attribute as property for convenience
+ @property
+ def lhs_shape(self):
+ return self.get_nodeattr("lhs_shape")
+
+ # Shape attribute as property for convenience
+ @property
+ def rhs_shape(self):
+ return self.get_nodeattr("rhs_shape")
+
+ # Shape attribute as property for convenience
+ @property
+ def out_shape(self):
+ return self.get_nodeattr("out_shape")
+
+ # Style attribute as property for convenience
+ @property
+ def lhs_style(self):
+ return self.get_nodeattr("lhs_style")
+
+ # Style attribute as property for convenience
+ @property
+ def rhs_style(self):
+ return self.get_nodeattr("rhs_style")
+
+ # Number of parallel processed elements as property for convenience
+ @property
+ def pe(self):
+ return self.get_nodeattr("PE")
+
+ # Checks whether the last axis is broadcast
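+    # e.g. lhs_shape=(1, 16) and rhs_shape=(1, 1) yields True (rhs broadcast)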
+ @property
+ def broadcast_last_axis(self):
+ return (self.lhs_shape[-1] == 1) != (self.rhs_shape[-1] == 1)
+
+ # Makes an operation compatible with the output shape for shape inference
+ # Note: Propagates shape forward, i.e., never asks for the shape of the
+ # output, even if it seems easier.
+ def make_shape_compatible_op(self, model: ModelWrapper): # noqa
+ # Get the node wrapped by this custom op
+ node = self.onnx_node
+ # There must be exactly two inputs to the binary operation
+ assert len(node.input) == 2, \
+ f"Binary operation {node.name} requires exactly two inputs"
+ # Validate input shapes match what is stored as attributes
+ assert model.get_tensor_shape(node.input[0]) == self.lhs_shape, \
+ f"Input shape mismatch: {node.name} {node.input[0]}"
+ assert model.get_tensor_shape(node.input[1]) == self.rhs_shape, \
+ f"Input shape mismatch: {node.name} {node.input[1]}"
+ # Validate broadcasting of inputs to the output shape
+ assert (list(np.broadcast_shapes(self.lhs_shape, self.rhs_shape))
+ == self.out_shape), f"Shape broadcast mismatch: {node.name}"
+ # Simulate behavior via the standard ONNX add operation
+ return oh.make_node("Add", node.input, node.output)
+
+ # Infers the datatype of the node output
+ def infer_node_datatype(self, model: ModelWrapper): # noqa
+ # Get the node wrapped by this custom op # noqa Duplicate
+ node = self.onnx_node
+ # Test for changing left-hand-side input datatype
+ if model.get_tensor_datatype(node.input[0]) != self.lhs_dtype:
+ # Get the new datatype
+ new_dtype = model.get_tensor_datatype(node.input[0])
+ # Issue a warning message
+ warnings.warn(
+ f"{node.name}: lhs_dtype changing from"
+ f" {self.lhs_dtype} to {new_dtype}"
+ )
+ # Set the new datatype attribute
+ self.set_nodeattr("lhs_dtype", new_dtype.name)
+ # Test for changing right-hand-side input datatype
+ if model.get_tensor_datatype(node.input[1]) != self.rhs_dtype:
+ # Get the new datatype
+ new_dtype = model.get_tensor_datatype(node.input[1])
+ # Issue a warning message
+ warnings.warn(
+ f"{node.name}: rhs_dtype changing from"
+ f" {self.rhs_dtype} to {new_dtype}"
+ )
+ # Set the new datatype attribute
+ self.set_nodeattr("rhs_dtype", new_dtype.name)
+ # Force the output data type stored as a node attribute
+ model.set_tensor_datatype(node.output[0], self.out_dtype)
+
+ # Executes elementwise operation in python
+ def _execute_node_python(self, context, graph): # noqa: graph unused
+ # Get the node wrapped by this custom op
+ node = self.onnx_node
+ # Get the inputs out of the execution context
+ lhs = context[node.input[0]]
+ rhs = context[node.input[1]]
+ # Note: Need to make sure these have the right type for the Numpy API
+        # Note: Always simulate integer inputs in int64, as numpy's default
+        # casting rules can otherwise produce surprising results
+ lhs = lhs.astype(np.int64) if self.lhs_dtype.is_integer() else lhs
+ rhs = rhs.astype(np.int64) if self.rhs_dtype.is_integer() else rhs
+ # Apply elementwise operation with broadcasting in numpy and insert
+ # result into the execution context
+ out = self.npy_op(lhs, rhs)
+ # Make sure the output has the right type, e.g. turn all booleans into
+ # integers (actually floats as the container type)
+ # Note: This is relevant for logical ops, ==, <=, >=, etc.
+ # Note: Somehow QONNX does not like boolean tensors
+ context[node.output[0]] = out.astype(self.out_dtype.to_numpy_dt())
+
+ # Executes elementwise operation in C++ simulation
+ def _execute_node_cppsim(self, context, graph): # noqa: graph unused
+ # C++ Simulation needs to be implemented in HLS backend specialization
+ raise NotImplementedError(
+ f"exec_mode cppsim of {self.__class__.__name__} is not implemented!"
+ )
+
+ # Executes elementwise operation in RTL simulation
+ def _execute_node_rtlsim(self, context, graph): # noqa: graph unused
+ # Get the node wrapped by this custom op # noqa Duplicate
+ node = self.onnx_node
+ # Input data is stored in numpy files in the code generation dictionary
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ # Get the inputs out of the execution context
+ lhs = context[node.input[0]] # noqa: Duplicate code prepare simulation
+ rhs = context[node.input[1]] # noqa: Duplicate code prepare simulation
+ # Validate the shape of the inputs
+ assert list(lhs.shape) == self.get_normal_input_shape(ind=0), \
+ f"Input shape mismatch for {node.input[0]}"
+ assert list(rhs.shape) == self.get_normal_input_shape(ind=1), \
+ f"Input shape mismatch for {node.input[1]} {rhs.shape=}"
+ # Reshape the inputs into folded form
+ lhs = lhs.reshape(self.get_folded_input_shape(ind=0))
+ rhs = rhs.reshape(self.get_folded_input_shape(ind=1))
+ # Path to store the intermediate inputs in numpy format
+ lhs_filename = os.path.join(code_gen_dir, "lhs.npy")
+ rhs_filename = os.path.join(code_gen_dir, "rhs.npy")
+ # Save the folded inputs to file to be used by simulation
+ np.save(lhs_filename, lhs)
+ np.save(rhs_filename, rhs)
+ # Start collecting inputs/outputs to the RTL simulation in a dictionary
+        # Note: Prepare one empty output list
+ io_dict = {
+ "inputs": {},
+ "outputs": {"out": []}
+ }
+ # Type and width of the input tensors
+ lhs_dtype = self.get_input_datatype(ind=0)
+ lhs_width = self.get_instream_width(ind=0)
+ rhs_dtype = self.get_input_datatype(ind=1)
+ rhs_width = self.get_instream_width(ind=1)
+
+ # If the left-hand-side is provided as runtime input it needs to be
+ # inserted into the RTL simulation inputs
+ if self.lhs_style == "input":
+ # Convert inputs to RTL simulation format
+ io_dict["inputs"]["lhs"] = npy_to_rtlsim_input(
+ lhs_filename, lhs_dtype, lhs_width
+ )
+
+ # If the right-hand-side is provided as runtime input it needs to be
+ # inserted into the RTL simulation inputs
+ if self.rhs_style == "input":
+ # Convert inputs to RTL simulation format
+ io_dict["inputs"]["rhs"] = npy_to_rtlsim_input(
+ rhs_filename, rhs_dtype, rhs_width
+ )
+
+        # Set up the RTL simulation (backend-dependent) of the node
+ sim = self.get_rtlsim() # noqa: Duplicate code prepare simulation
+ # Reset the RTL simulation
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ # Run the RTL Simulation
+ self.rtlsim_multi_io(sim, io_dict)
+ # free up resources
+ self.close_rtlsim(sim)
+
+ # Collect the output from RTL simulation
+ out = io_dict["outputs"]["out"]
+ # Type and sizes of the output tensor
+ dtype = self.get_output_datatype(ind=0) # noqa: Duplicate readout code
+ width = self.get_outstream_width(ind=0)
+ shape = self.get_folded_output_shape(ind=0)
+ # Path to store the intermediate numpy file
+ filename = os.path.join(code_gen_dir, "out.npy")
+ # Convert from RTL simulation format to numpy format
+ rtlsim_output_to_npy(
+ out, filename, dtype, shape, width, dtype.bitwidth()
+ )
+ # Load the generated output numpy file
+ out = np.load(filename)
+ # Reshape the folded output and insert into the execution context
+ context[node.output[0]] = out.reshape(
+ self.get_normal_output_shape(ind=0)
+ )
+
+    # Executes elementwise op in simulation (either Python, C++ or RTL sim)
+ def execute_node(self, context, graph):
+ # Get the configured execution mode
+ mode = self.get_nodeattr("exec_mode")
+ # Lookup table mapping execution modes to implementing methods
+ exec_fns = {
+ "python": self._execute_node_python,
+ "cppsim": self._execute_node_cppsim,
+ "rtlsim": self._execute_node_rtlsim,
+ }
+ # Select and execute the function by mode string
+ exec_fns[mode](context, graph)
+
+ # Verifies the node attributes, inputs and outputs
+ def verify_node(self):
+ # TODO: Implement
+ return []
+
+ # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff
+
+ # Gets the datatype of input at index ind
+ def get_input_datatype(self, ind=0):
+ # Get input data type by index, order inputs from left to right
+ return [self.lhs_dtype, self.rhs_dtype][ind]
+
+ # Gets the datatype of the output at index ind
+ def get_output_datatype(self, ind=0):
+ # There is only one output, the type is set as an attribute
+ return self.out_dtype
+
+ # Gets the shape of the input at index ind without folding
+ def get_normal_input_shape(self, ind=0):
+ # Input shapes are stored as a node attributes
+ return [self.lhs_shape, self.rhs_shape][ind]
+
+ # Gets the shape of the output at index ind without folding
+ def get_normal_output_shape(self, ind=0):
+ # The output shape is stored as a node attribute
+ return self.out_shape
+
+ # Gets the shape of the input at index ind with folding
+ def get_folded_input_shape(self, ind=0):
+ # Get the normal shape before applying folding
+ *num_inputs, num_elems = self.get_normal_input_shape(ind=ind)
+ # Folding only applies if the folded axis is not broadcast
+ if not self.broadcast_last_axis or num_elems != 1:
+ # Valid folding requires the PE to divide the number of elements
+ assert num_elems % self.pe == 0, "PE must divide last axis"
+ # Folding along the last dimension
+ return *num_inputs, num_elems // self.pe, self.pe
+ # For broadcast axes return the non-folded shape with dummy axis
+ # inserted
+ return *num_inputs, 1, num_elems
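+        # e.g. a (1, 32) input with PE=4 folds to (1, 8, 4), while a broadcast
+        # last axis such as (1, 1) stays unfolded as (1, 1, 1)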
+
+ # Gets the shape of the output at index ind with folding
+ def get_folded_output_shape(self, ind=0):
+ # Get the normal shape before applying folding
+ *num_inputs, num_elems = self.get_normal_output_shape(ind=ind)
+ # Valid folding requires the PE to divide the number of elements
+ assert num_elems % self.pe == 0, "PE must divide last axis"
+ # Folding along the last dimension
+ return *num_inputs, num_elems // self.pe, self.pe
+
+ # Widths of the input data stream of the input at index ind
+ def get_instream_width(self, ind=0):
+ # Get the number of bits used to represent the input
+ i_bits = self.get_input_datatype(ind).bitwidth()
+ # Parallelism is the number of elements in the last dimension of the
+ # folded input
+ *_, elems = self.get_folded_input_shape(ind)
+ # Width of a stream receiving input elements in parallel
+ return elems * i_bits
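+        # e.g. PE=4 elements of an INT8 input yield a 32-bit wide stream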
+
+ # Widths of the output data stream of the output at index ind
+ def get_outstream_width(self, ind=0):
+ # Get the number of bits used to represent the output
+ o_bits = self.get_output_datatype(ind).bitwidth()
+ # Parallelism is the number of elements in the last dimension of the
+ # folded output
+ *_, elems = self.get_folded_output_shape(ind)
+ # Width of a stream producing output elements in parallel
+ return elems * o_bits
+
+ # Gets the number of expected output values, i.e. how many times read()
+ # could/should be called on any output stream of this operator
+ def get_number_output_values(self):
+ # Elements over all but the last dimension of the output folded along
+        # the last (PE) dimension.
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+    # Minimizes the width of the accumulator data type; it is called the
+    # 'accumulator width' by convention, though it is actually the output
+    # data type
+ def minimize_accumulator_width(self, model: ModelWrapper):
+ # If any of the inputs is not an integer, the bit-width cannot be
+ # minimized
+ if not all([self.lhs_dtype.is_integer(), self.rhs_dtype.is_integer()]):
+ # Check the annotated tensor data type corresponds to the stored
+ # attribute
+ assert (model.get_tensor_datatype(self.onnx_node.output[0])
+ == self.out_dtype), \
+ f"Output type mismatch for {self.onnx_node.name}"
+ # Exit here, returning the not-minimized data type
+ return self.out_dtype
+ # Call the output type derivation specialized by the concrete operator
+ # implementation
+ out_dtype = self._derive_out_dtype(model)
+ # Set the new output data type as attribute
+ self.set_nodeattr("out_dtype", out_dtype.name)
+ # Annotate the output tensor with the new data type
+ model.set_tensor_datatype(self.onnx_node.output[0], out_dtype)
+ # Return the minimized output data type
+ # Note: Probably not required by MinimizeAccumulatorWidth transformation
+ return out_dtype
+
+ # Derives the optimal width of the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Depends on the actual operation performed and must be specialized by
+ # the concrete implementations
+ raise NotImplementedError(
+ f"_derive_out_dtype of {self.__class__.__name__}"
+ f" is not implemented!"
+ )
+
+ # Minimizes the width of the weight data type, 'weight' here due to
+ # convention, it actually applies to any constant initializer input
+ def minimize_weight_bit_width(self, model: ModelWrapper):
+ # Check for an initializer providing the left hand side input
+ lhs = model.get_initializer(self.onnx_node.input[0])
+ # weight bitwidth minimization doesn't make sense for float inputs
+ # so we'll skip those (at least until we have minifloat support)
+ old_lhs_dt = model.get_tensor_datatype(self.onnx_node.input[0])
+ # TODO move const bitwidth minimization to a utility function + reuse
+ # If the left hand side input is provided as initializer, minimize the
+ # bits used for storing this
+ if lhs is not None and old_lhs_dt.is_integer():
+ # Remember the "style" of receiving the input for further code
+ # generation
+ self.set_nodeattr("lhs_style", "const")
+ # Minimum and maximum "weight" on the left hand side, determining
+ # the range of values which needs to be represented
+ _min = lhs.min()
+ _max = lhs.max()
+ # Determine whether signed or unsigned type is required for
+ # representing the weights and select the largest "signed magnitude"
+            _mag = _max if _min >= 0 else \
+ _min if (abs(_min) > _max) else (-_max - 1)
+ # Smallest data type large enough to represent this range of values
+ dtype = DataType.get_smallest_possible(_mag)
+ # Update the corresponding data type attribute of the node
+ self.set_nodeattr("lhs_dtype", dtype.name)
+ # Annotate the tensor with the new data type
+ model.set_tensor_datatype(self.onnx_node.input[0], dtype)
+
+ # Check for an initializer providing the right hand side input
+ rhs = model.get_initializer(self.onnx_node.input[1])
+ old_rhs_dt = model.get_tensor_datatype(self.onnx_node.input[1])
+ # If the right hand side input is provided as initializer, minimize the
+ # bits used for storing this
+ if rhs is not None and old_rhs_dt.is_integer():
+ # Remember the "style" of receiving the input for further code
+ # generation
+ self.set_nodeattr("rhs_style", "const")
+ # Minimum and maximum "weight" on the right hand side, determining
+ # the range of values which needs to be represented
+ _min = rhs.min()
+ _max = rhs.max()
+ # Determine whether signed or unsigned type is required for
+ # representing the weights and select the largest "signed magnitude"
+            _mag = _max if _min >= 0 else \
+ _min if (abs(_min) > _max) else (-_max - 1)
+ # Smallest data type large enough to represent this range of values
+ dtype = DataType.get_smallest_possible(_mag)
+ # Update the corresponding data type attribute of the node
+ self.set_nodeattr("rhs_dtype", dtype.name)
+ # Annotate the tensor with the new data type
+ model.set_tensor_datatype(self.onnx_node.input[1], dtype)
+
+ # TODO: MVAU returns the data type here, which does not make sense for
+ # potentially two data types changing and apparently, the
+ # MinimizeWeightBitWidth transformations does not even use the returned
+ # value.
+
+ # Derives the expected cycles for the elementwise binary operation given the
+ # folding configuration
+ def get_exp_cycles(self):
+ # Number of iterations required to process the whole folded input stream
+ # Note: This is all but the PE (last, parallelized) dimension
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+
+# Derive a specialization to implement elementwise addition of two inputs
+@register_custom_op
+class ElementwiseAdd(ElementwiseBinaryOperation):
+ # Specialize to implement the addition operation of left hand side and right
+ # hand side input
+ _operation = "Add", np.add, "({0} + {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs and the larger of the
+ # two widths
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+ max_width = max(lhs_width, rhs_width)
+ # Check whether the addition operation is a signed addition
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # By default, the output is one bit more than the widest of the inputs
+ out_width = max_width + 1
+ # If the addition is signed, the output might be wider depending on
+ # which of the inputs is signed
+ if signed:
+ # Find the wider and narrower of the two inputs by assuming left to
+ # right order first
+ wider, narrower = self.lhs_dtype, self.rhs_dtype
+ # Swap if the order is not correct
+ if narrower.bitwidth() > wider.bitwidth():
+ wider, narrower = narrower, wider
+ # If and only if the wider is unsigned and the narrower is signed,
+ # add two bits to the output width
+ if not wider.signed() and narrower.signed():
+ # Out has two bits more than the widest input
+ out_width = max_width + 2
+ # The new output type is a signed integer of the calculated
+ # bit-width
+ return DataType[f"INT{out_width}"]
+ # By default, if both inputs are unsigned, the output is unsigned as
+ # well
+ return DataType[f"UINT{out_width}"]
+
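+# Worked example of the rule above: adding UINT8 and INT4, the wider input is
+# unsigned and the narrower is signed, so the sum type is INT10; adding INT8
+# and INT4 instead yields INT9 (one bit more than the widest input).
+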
+
+# Derive a specialization to implement elementwise subtraction of two inputs
+@register_custom_op
+class ElementwiseSub(ElementwiseBinaryOperation):
+ # Specialize to implement the subtraction operation of left hand side and
+ # right hand side input
+ _operation = "Sub", np.subtract, "({0} - {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs and the larger of the
+ # two widths
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+ max_width = max(lhs_width, rhs_width)
+        # Check whether the subtraction operation involves signed operands
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # By default, the output is one bit more than the widest of the inputs
+ out_width = max_width + 1
+ # If the operation is signed, the output might be wider depending on
+ # which of the inputs is signed
+ if signed:
+ # Find the wider and narrower of the two inputs by assuming left to
+ # right order first
+ wider, narrower = self.lhs_dtype, self.rhs_dtype
+ # Swap if the order is not correct
+ if narrower.bitwidth() > wider.bitwidth():
+ wider, narrower = narrower, wider
+ # If and only if the wider is unsigned and the narrower is signed,
+ # add two bits to the output width
+ if not wider.signed() and narrower.signed():
+ # Out has two bits more than the widest input
+ out_width = max_width + 2
+ # For subtraction, the output data type is always signed
+ return DataType[f"INT{out_width}"]
+
+
+# Derive a specialization to implement elementwise multiplication of two inputs
+@register_custom_op
+class ElementwiseMul(ElementwiseBinaryOperation):
+ # Specialize to implement the multiplication operation of left hand side and
+ # right hand side input
+ _operation = "Mul", np.multiply, "({0} * {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+        # Check whether either operand of the multiplication is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # The width of the product is the sum of the widths of the operands.
+ out_width = lhs_width + rhs_width
+ # The product is treated as a signed type if either of the operands is
+ # of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# Derive a specialization to implement elementwise division of two inputs
+@register_custom_op
+class ElementwiseDiv(ElementwiseBinaryOperation):
+ # TODO: Not tested due to divide by zero from randomly generated inputs...
+ # Specialize to implement the division operation of left hand side and
+ # right hand side input
+ _operation = "Div", np.divide, "({0} / {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+        # Get the width of the dividend (left-hand-side) data type
+        lhs_width = self.lhs_dtype.bitwidth()
+        # Check whether either operand of the division is of a signed type
+        signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # The width of the quotient is the width of the dividend if the divisor
+ # is an unsigned type. Otherwise, it is the width of the dividend plus
+ # one.
+ out_width = lhs_width if not self.rhs_dtype.signed() else lhs_width + 1
+ # The quotient is treated as a signed type if either of the operands is
+ # of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# TODO: ElementwiseMod - Requires extra attribute selecting the function
+
+
+# Derive a specialization to implement elementwise logical and of two inputs
+@register_custom_op
+class ElementwiseAnd(ElementwiseBinaryOperation):
+ # Specialize to implement the logical and operation of left hand side and
+ # right hand side input
+ _operation = "And", np.logical_and, "({0} && {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise logical or of two inputs
+@register_custom_op
+class ElementwiseOr(ElementwiseBinaryOperation):
+ # Specialize to implement the logical or operation of left hand side and
+ # right hand side input
+ _operation = "Or", np.logical_or, "({0} || {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise logical xor of two inputs
+@register_custom_op
+class ElementwiseXor(ElementwiseBinaryOperation):
+ # Specialize to implement the logical xor operation of left hand side and
+ # right hand side input
+ _operation = "Xor", np.logical_xor, "(bool({0}) != bool({1}))", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise equality of two inputs
+@register_custom_op
+class ElementwiseEqual(ElementwiseBinaryOperation):
+ # Specialize to implement the logical equal operation of left hand side and
+ # right hand side input
+ _operation = "Equal", np.equal, "({0} == {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise less of two inputs
+@register_custom_op
+class ElementwiseLess(ElementwiseBinaryOperation):
+ # Specialize to implement the logical less operation of left hand side and
+ # right hand side input
+ _operation = "Less", np.less, "({0} < {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise less or equal of two inputs
+@register_custom_op
+class ElementwiseLessOrEqual(ElementwiseBinaryOperation):
+ # Specialize to implement the logical less or equal operation of left hand
+ # side and right hand side input
+ _operation = "LessOrEqual", np.less_equal, "({0} <= {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise greater of two inputs
+@register_custom_op
+class ElementwiseGreater(ElementwiseBinaryOperation):
+ # Specialize to implement the logical greater operation of left hand side
+ # and right hand side input
+ _operation = "Greater", np.greater, "({0} > {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise greater or equal of two
+# inputs
+@register_custom_op
+class ElementwiseGreaterOrEqual(ElementwiseBinaryOperation):
+ # Specialize to implement the logical greater or equal operation of left
+ # hand side and right hand side input
+ _operation = "GreaterOrEqual", np.greater_equal, "({0} >= {1})", None
+
+ # Derives the output data type
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Treat the boolean output of a logical operation as unsigned integer of
+ # width 1, i.e., a single bit True/False
+ return DataType["BINARY"]
+
+
+# Derive a specialization to implement elementwise bitwise and of two inputs
+@register_custom_op
+class ElementwiseBitwiseAnd(ElementwiseBinaryOperation):
+ # Specialize to implement the bitwise and operation of left hand side and
+ # right hand side input
+ _operation = "BitwiseAnd", np.bitwise_and, "({0} & {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs # noqa: Duplicate
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+        # Check whether either operand is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # The bitwise logical operators all return a value with a width that is
+ # the maximum of the widths of the two operands.
+ out_width = max(lhs_width, rhs_width)
+        # The result is treated as a signed type if either of the operands is
+        # of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# Derive a specialization to implement elementwise bitwise or of two inputs
+@register_custom_op
+class ElementwiseBitwiseOr(ElementwiseBinaryOperation):
+ # Specialize to implement the bitwise or operation of left hand side and
+ # right hand side input
+ _operation = "BitwiseOr", np.bitwise_or, "({0} | {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs # noqa: Duplicate
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+        # Check whether either operand is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # The bitwise logical operators all return a value with a width that is
+ # the maximum of the widths of the two operands.
+ out_width = max(lhs_width, rhs_width)
+        # The result is treated as a signed type if either of the operands is
+        # of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# Derive a specialization to implement elementwise bitwise xor of two inputs
+@register_custom_op
+class ElementwiseBitwiseXor(ElementwiseBinaryOperation):
+ # Specialize to implement the bitwise xor operation of left hand side and
+ # right hand side input
+ _operation = "BitwiseXor", np.bitwise_xor, "({0} ^ {1})", None
+
+ # Derives the output data type according to UG1399
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # Get the width of the data types of the inputs # noqa: Duplicate
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+        # Check whether either operand is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # The bitwise logical operators all return a value with a width that is
+ # the maximum of the widths of the two operands.
+ out_width = max(lhs_width, rhs_width)
+        # The result is treated as a signed type if either of the operands is
+        # of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# Derive a specialization to implement elementwise maximum of two inputs
+@register_custom_op
+class ElementwiseMaximum(ElementwiseBinaryOperation):
+ _operation = "Maximum", np.maximum, "({0} >= {1} ? {0} : {1})", None
+
+ def _derive_out_dtype(self, model: ModelWrapper):
+ if (not self.lhs_dtype.is_integer()) or (not self.rhs_dtype.is_integer()):
+ # if any of the inputs are float, make the output float as well
+ # TODO better float dtype resolution? (fp16 also possible)
+ return DataType["FLOAT32"]
+ else:
+ # Get the width of the data types of the inputs # noqa: Duplicate
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+            # Check whether either operand is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # use the greater of the two input bitwidths for the output
+ out_width = max(lhs_width, rhs_width)
+            # The result is treated as a signed type if either of the operands
+            # is of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# Derive a specialization to implement elementwise minimum of two inputs
+@register_custom_op
+class ElementwiseMinimum(ElementwiseBinaryOperation):
+ _operation = "Minimum", np.minimum, "({0} <= {1} ? {0} : {1})", None
+
+ def _derive_out_dtype(self, model: ModelWrapper):
+ if (not self.lhs_dtype.is_integer()) or (not self.rhs_dtype.is_integer()):
+ # if any of the inputs are float, make the output float as well
+ # TODO better float dtype resolution? (fp16 also possible)
+ return DataType["FLOAT32"]
+ else:
+ # Get the width of the data types of the inputs # noqa: Duplicate
+ lhs_width = self.lhs_dtype.bitwidth()
+ rhs_width = self.rhs_dtype.bitwidth()
+            # Check whether either operand is of a signed type
+ signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()])
+ # use the greater of the two input bitwidths for the output
+ out_width = max(lhs_width, rhs_width)
+            # The result is treated as a signed type if either of the operands
+            # is of a signed type.
+ return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"]
+
+
+# reference function for Python exec
+# note that the y argument is ignored, but needed
+# to make this pass as a binary op
+def float2int(x, y, bitwidth, narrow, signed):
+ min_val = min_int(signed, narrow, bitwidth)
+ max_val = max_int(signed, narrow, bitwidth)
+ x_rounded = np.round(x)
+ x_clipped = np.clip(x_rounded, min_val, max_val)
+ return x_clipped
+
+
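+# Worked example: float2int(np.array([3.7, -9.2]), None, bitwidth=4, narrow=0,
+# signed=1) rounds to [4., -9.] and clips to the INT4 range [-8, 7], giving
+# [4., -8.]. Note np.round is applied regardless of the rounding_mode
+# attribute.
+
+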
+# TODO this is not really a binary op: it could be treated as unary (w/ attributes)
+# or as ternary (if we take in the min/max values as inputs)
+# Derive a specialization to implement elementwise conversion of float values
+# to integers of a particular specification (bitwidth, signedness, narrow_range)
+@register_custom_op
+class ElementwiseFloat2Int(ElementwiseBinaryOperation):
+
+ # Defines attributes which must be present on this node
+ def get_nodeattr_types(self):
+ # Start from parent operator class attributes
+ attrs = ElementwiseBinaryOperation.get_nodeattr_types(self)
+ # Update attributes dictionary for new custom operator
+ attrs.update({
+ # Bitwidth of output integers
+ "bitwidth": ("i", True, 0),
+ # Whether output integers are signed or unsigned
+ "signed": ("i", True, 0),
+ # Whether output integers use narrow-range
+ "narrow": ("i", True, 0),
+ # The rounding mode, which is used for the quant function
+ "rounding_mode": ("s", True, "ROUND"),
+ })
+ # Return updated attribute dictionary
+ return attrs
+
+ # since we use attributes to drive part of the function inputs,
+ # we cannot statically assign _operation like other subclasses
+ # instead, we override the properties accessed for codegen
+
+ @property
+ def npy_op(self) -> np.ufunc:
+ bitwidth = self.get_nodeattr("bitwidth")
+ signed = self.get_nodeattr("signed")
+ narrow = self.get_nodeattr("narrow")
+ return partial(float2int, bitwidth=bitwidth, narrow=narrow, signed=signed)
+
+ # C++ operation template available as property
+ @property
+ def cpp_op(self) -> str:
+ bitwidth = self.get_nodeattr("bitwidth")
+ signed = self.get_nodeattr("signed")
+ narrow = self.get_nodeattr("narrow")
+ min_val = min_int(signed, narrow, bitwidth)
+ max_val = max_int(signed, narrow, bitwidth)
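+        # e.g. bitwidth=4, signed=1, narrow=0 renders "clip(hls::round({0}), -8, 7)"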
+ return "clip(hls::round({0}), %d, %d)" % (min_val, max_val)
+
+ # RTL operation template available as property
+ @property
+ def rtl_op(self) -> str:
+ return None
+
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # the attributes decide the output datatype
+ bitwidth = self.get_nodeattr("bitwidth")
+ signed = self.get_nodeattr("signed")
+ return DataType[f"INT{bitwidth}"] if signed else DataType[f"UINT{bitwidth}"]
+
+
+# TODO this is not really a binary op: it is unary
+# Derive a specialization to implement elementwise dtype casting
+@register_custom_op
+class ElementwiseFloatCast(ElementwiseBinaryOperation):
+
+ # Defines attributes which must be present on this node
+ def get_nodeattr_types(self):
+ # Start from parent operator class attributes
+ attrs = ElementwiseBinaryOperation.get_nodeattr_types(self)
+ # Update attributes dictionary for new custom operator
+ attrs.update({
+ # Target datatype for the cast
+ "target_dtype": ("s", True, ""),
+ })
+ # Return updated attribute dictionary
+ return attrs
+
+ # since we use attributes to drive part of the function inputs,
+ # we cannot statically assign _operation like other subclasses
+ # instead, we override the properties accessed for codegen
+
+ @property
+ def npy_op(self) -> np.ufunc:
+        target_dtype = DataType[self.get_nodeattr("target_dtype")]
+        # Note: np.cast was removed from recent numpy versions; cast via
+        # astype instead. The second (y) argument is ignored, analogous to
+        # float2int above.
+        return lambda x, y: np.asarray(x).astype(target_dtype.to_numpy_dt())
+
+ # C++ operation template available as property
+ @property
+ def cpp_op(self) -> str:
+ target_dtype = DataType[self.get_nodeattr("target_dtype")]
+ return "((%s) {0})" % (target_dtype.get_hls_datatype_str())
+
+ # RTL operation template available as property
+ @property
+ def rtl_op(self) -> str:
+ return None
+
+ def _derive_out_dtype(self, model: ModelWrapper):
+ # the attributes decide the output datatype
+ target_dtype = DataType[self.get_nodeattr("target_dtype")]
+ return target_dtype
+
+# TODO: ElementwiseBitShift - Requires extra attribute selecting the direction
+
+
+# # Derive a specialization to implement elementwise power of two inputs
+# TODO: std::pow does not work for HLS types and hls::pow fails to link for some
+# reason
+# @register_custom_op
+# class ElementwisePow(ElementwiseBinaryOperation):
+# # Specialize to implement the power operation of left hand side and
+# # right hand side input
+# _operation = "Pow", np.power, "(std::pow({0}, {1}))", None
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 405c47a08d..3fb958a99e 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -26,6 +26,37 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# The base class of all HWCustomOp specializations to HLS backend implementation
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+# The base class of all generic custom operations before specializing to either
+# HLS or RTL backend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# Dictionary of HLSBackend implementations
+custom_op = dict()
+
+
+# Registers a class into the custom_op dictionary
+# Note: This must be defined first, before importing any custom op
+# implementation to avoid "importing partially initialized module" issues.
+def register_custom_op(cls):
+ # The class must actually implement HWCustomOp
+ assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}"
+ # The class must also implement the HLSBackend
+ assert issubclass(cls, HLSBackend), f"{cls} must subclass {HLSBackend}"
+ # Insert the class into the custom_op dictionary by its name
+ custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue?
+ # Pass through the class unmodified
+ return cls
+
+
+# flake8: noqa
+# Disable linting from here, as all imports will be flagged E402 and maybe F401
+
+# Import the submodule containing specializations of ElementwiseBinaryOperation
+# Note: This will automatically register all decorated classes into this domain
+import finn.custom_op.fpgadataflow.hls.elementwise_binary_hls
from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls
from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls
from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
@@ -53,8 +84,6 @@
from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls
-custom_op = dict()
-
# make sure new HLSCustomOp subclasses are imported here so that they get
# registered and plug in correctly into the infrastructure
custom_op["AddStreams_hls"] = AddStreams_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
index a3f0e043f8..b713be14e5 100644
--- a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
@@ -126,8 +126,12 @@ def execute_node(self, context, graph):
"{}/input_1.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {"inputs": {"in0": rtlsim_inp0, "in1": rtlsim_inp1}, "outputs": {"out": []}}
+ self.rtlsim_multi_io(sim, io_dict)
+ rtlsim_output = io_dict["outputs"]["out"]
+ super().close_rtlsim(sim)
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
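The same refactoring recurs across essentially all *_hls execute_node() methods below: clock toggling becomes pyverilator-only, the single-stream rtlsim() helper is replaced by rtlsim_multi_io() with an explicit io_dict, and the simulation is closed explicitly afterwards. A hedged sketch of the shared pattern (hypothetical helper name; the individual methods inline this code rather than calling a helper):

    # Sketch of the recurring rtlsim execution pattern (hypothetical helper)
    def run_rtlsim_single_io(self, sim, rtlsim_inp):
        self.reset_rtlsim(sim)
        # only the pyverilator backend needs an explicit clock toggle
        if self.get_nodeattr("rtlsim_backend") == "pyverilator":
            self.toggle_clk(sim)
        io_dict = {"inputs": {"in0": rtlsim_inp}, "outputs": {"out": []}}
        # drives inputs and collects outputs for either rtlsim backend
        self.rtlsim_multi_io(sim, io_dict)
        # explicitly release the simulator handle
        self.close_rtlsim(sim)
        return io_dict["outputs"]["out"]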
diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
index 14efa113dd..c224cf64d4 100644
--- a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
@@ -284,8 +284,15 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- output = self.rtlsim(sim, inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
index 8a72ca3c6c..5bef15c66f 100644
--- a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
@@ -188,12 +188,14 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
io_dict = {
"inputs": {"in0": inp},
"outputs": {"out": []},
}
self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
index 008fa9cee8..bf1f906b63 100644
--- a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
@@ -143,9 +143,10 @@ def execute_node(self, context, graph):
)
io_dict["inputs"]["in%d" % i] = rtlsim_inp
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
-
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
rtlsim_output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
diff --git a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
index 4a5c02ee06..0e45ea7ef5 100644
--- a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
@@ -387,8 +387,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py
index 56f472b9c0..df045583fc 100644
--- a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py
@@ -138,8 +138,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
index e19149435e..a9fbe3ddf0 100644
--- a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
@@ -148,7 +148,8 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
rtlsim_dict = {
"inputs": {"in0": rtlsim_inp},
"outputs": {},
@@ -156,6 +157,7 @@ def execute_node(self, context, graph):
for i in range(n_outputs):
rtlsim_dict["outputs"]["out%d" % i] = []
self.rtlsim_multi_io(sim, rtlsim_dict)
+ super().close_rtlsim(sim)
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py
new file mode 100644
index 0000000000..28bf6026d8
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py
@@ -0,0 +1,842 @@
+# fmt: off
+# Disable the formatter. This file is deliberately formatted to stay within 80
+# characters per line; Black would reformat some lines beyond this limit.
+
+# Numpy math and arrays
+import numpy as np
+
+# Operating system stuff, e.g. paths
+import os
+
+# Cleanup post-processing of generated code
+import textwrap
+
+# QONNX wrapper to ONNX model graphs
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+
+# Specializations of the generic HW operator
+import finn.custom_op.fpgadataflow.elementwise_binary as elementwise_binary
+
+# The generic HW custom operator version of the operator as a base class
+from finn.custom_op.fpgadataflow.elementwise_binary import ( # noqa
+ ElementwiseBinaryOperation,
+)
+
+# Utility for registering HLSBackend HWCustomOp implementations into the module
+# scope
+from finn.custom_op.fpgadataflow.hls import register_custom_op
+
+# Base class for specializing HW operators as implemented via HLS
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+# Convert and pack (numpy) data for C++ code generation
+from finn.util.data_packing import numpy_to_hls_code
+
+# Mapping of memory resource attributes to the corresponding C++ HLS
+# pragma directives
+RAM_STYLES = {
+ "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", "ultra": "URAM"
+}
+
+
+# HLS backend specialization of the generic elementwise binary operation
+class ElementwiseBinaryOperation_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation, HLSBackend
+):
+ # Node attributes matching the HLS operator
+ def get_nodeattr_types(self):
+ # Start from parent operator class attributes
+ attrs = ElementwiseBinaryOperation.get_nodeattr_types(self)
+ # Add the HLSBackend default attributes on top
+ attrs.update(HLSBackend.get_nodeattr_types(self))
+ # Add/Specialize implementation specific attributes here...
+ # Return the updated attributes dictionary
+ return attrs
+
+ # Executes elementwise operation in C++ simulation
+ def _execute_node_cppsim(self, context, graph): # noqa: graph unused
+ # Get the node wrapped by this custom op
+ node = self.onnx_node
+ # Input data is stored in numpy files in the code generation dictionary
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ # Get the inputs out of the execution context
+ lhs = context[node.input[0]] # noqa: Duplicate code prepare simulation
+ rhs = context[node.input[1]]
+ # Validate the shape of the inputs
+ assert list(lhs.shape) == self.get_normal_input_shape(ind=0), \
+ f"Input shape mismatch for {node.input[0]}"
+ assert list(rhs.shape) == self.get_normal_input_shape(ind=1), \
+ f"Input shape mismatch for {node.input[1]} {rhs.shape=}"
+ # Reshape the inputs into folded form
+ lhs = lhs.reshape(self.get_folded_input_shape(ind=0))
+ rhs = rhs.reshape(self.get_folded_input_shape(ind=1))
+ # Save the folded inputs to file to be used by simulation
+ np.save(os.path.join(code_gen_dir, "lhs.npy"), lhs)
+ np.save(os.path.join(code_gen_dir, "rhs.npy"), rhs)
+
+ # Execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+
+ # Load the output numpy file generated by the C++ simulation
+ out = np.load(os.path.join(code_gen_dir, "out.npy"))
+ # Reshape the folded output and insert into the execution context
+ context[node.output[0]] = out.reshape(
+ self.get_normal_output_shape(ind=0)
+ )
+
+ # Maximum width of any ap_int used in this operator
+ def get_ap_int_max_w(self):
+ # Find the width of the wider of the two inputs
+ i_bits_max = max(
+ self.get_instream_width(ind=0),
+ self.get_instream_width(ind=1)
+ )
+ # Width of the output; there is just one output stream
+ o_bits_max = self.get_outstream_width(ind=0)
+ # Find the biggest of the inputs/outputs
+ return max([i_bits_max, o_bits_max])
+
+ # Note: End of shape and datatype utilities
+
+ # Generates list of C++ includes to be placed at the top of the generated
+ # code
+ def global_includes(self):
+ # Include the flatten() helper used to pack the PE group into the output
+ self.code_gen_dict["$GLOBALS$"] = ['#include "flatten.hpp"']
+
+ # Generates C++ parameters file, i.e., constant initializer inputs
+ def generate_params(self, model: ModelWrapper, path: str):
+ # The code generation directory is specified as an argument, so this
+ # will work for both RTL and C++ simulation
+ code_gen_dir = path
+ # By default, assume runtime inputs, which require no parameter code
+ lhs_code = rhs_code = ""
+ # Check for an initializer providing the left hand side input
+ lhs = model.get_initializer(self.onnx_node.input[0])
+ # Folded output shape for broadcasting/aligning the input shapes
+ out_shape = self.get_folded_output_shape(ind=0)
+ # Type of memory to use for storing constant parameters
+ ram_style = RAM_STYLES[self.get_nodeattr("ram_style")]
+
+ # Check whether there are already pragmas in the code generation
+ # dictionary
+ if "$PRAGMAS$" not in self.code_gen_dict:
+ # If not, insert an empty list to collect more pragmas
+ # Note: Do this here, as it is easier to add the array partition
+ # and bind storage pragmas for the generated parameters where the
+ # shape is computed.
+ self.code_gen_dict["$PRAGMAS$"] = []
+
+ # If the left hand side input is provided as initializer, generate
+ # initializer parameters code
+ if lhs is not None:
+ # Remember the "style" of receiving the input for further code
+ # generation
+ self.set_nodeattr("lhs_style", "const")
+ # Reshape the parameter tensor into folded shape
+ lhs = lhs.reshape(*self.get_folded_input_shape(ind=0))
+ # Need to make sure there are PE-many elements which can be accessed
+ # in parallel
+ if lhs.shape[-1] != self.pe: # noqa: Duplicate
+ # Broadcast the parameter tensor "offline" to have PE elements
+ # TODO: This replicates all parameters and might be inefficient
+ # in terms of memory utilization. It might be more efficient to
+ # replicate the PEs when needed in docompute, probably at the
+ # cost of some latency for extra reads and registers.
+ lhs = np.broadcast_to(lhs, lhs.shape[:-1] + (self.pe,))
+ # Current, possibly non-aligned input shape
+ lhs_shape = lhs.shape
+ # Fill up shape from the left to match the broadcast output shape
+ lhs_shape = (len(out_shape) - len(lhs_shape)) * (1,) + lhs_shape
+ # Reshape the input to align with the output shape
+ lhs = lhs.reshape(*lhs_shape)
+ # Generate C++ array initialization code
+ # Note: no packing, but with variable name/type declaration
+ lhs_code = numpy_to_hls_code(
+ lhs, self.lhs_dtype, "lhs", False, False
+ )
+ # Add pragma configuring the storage type to use for the parameter
+ # tensors: This is a constant parameter implemented as dual-port ROM
+ self.code_gen_dict["$PRAGMAS$"].append(
+ f"#pragma HLS BIND_STORAGE"
+ f" variable=lhs type=ROM_2P impl={ram_style}"
+ )
+ # Add pragma to partition the parameter tensor along the last
+ # dimensions, i.e., the PE dimension for parallel access
+ self.code_gen_dict["$PRAGMAS$"].append(
+ f"#pragma HLS ARRAY_PARTITION"
+ f" variable=lhs complete dim={len(lhs_shape)}"
+ )
+
+ # Check for an initializer providing the right hand side input
+ rhs = model.get_initializer(self.onnx_node.input[1])
+ # If the right hand side input is provided as initializer, generate
+ # initializer parameters code
+ if rhs is not None:
+ # Remember the "style" of receiving the input for further code
+ # generation
+ self.set_nodeattr("rhs_style", "const")
+ # Reshape the parameter tensor into folded shape
+ rhs = rhs.reshape(*self.get_folded_input_shape(ind=1))
+ # Need to make sure there are PE-many elements which can be accessed
+ # in parallel
+ if rhs.shape[-1] != self.pe: # noqa: Duplicate
+ # Broadcast the parameter tensor "offline" to have PE elements
+ # TODO: This replicates all parameters and might be inefficient
+ # in terms of memory utilization. It might be more efficient to
+ # replicate the PEs when needed in docompute, probably at the
+ # cost of some latency for extra reads and registers.
+ rhs = np.broadcast_to(rhs, rhs.shape[:-1] + (self.pe,))
+ # Current, possibly non-aligned input shape
+ rhs_shape = rhs.shape
+ # Fill up shape from the left to match the broadcast output shape
+ rhs_shape = (len(out_shape) - len(rhs_shape)) * (1,) + rhs_shape
+ # Reshape the input to align with the output shape
+ rhs = rhs.reshape(*rhs_shape)
+ # Generate C++ array initialization code
+ # Note: no packing, but with variable name/type declaration
+ rhs_code = numpy_to_hls_code(
+ rhs, self.rhs_dtype, "rhs", False, False
+ )
+ # Add pragma configuring the storage type to use for the parameter
+ # tensors: This is a constant parameter implemented as dual-port ROM
+ self.code_gen_dict["$PRAGMAS$"].append(
+ f"#pragma HLS BIND_STORAGE"
+ f" variable=rhs type=ROM_2P impl={ram_style}"
+ )
+ # Add pragma to partition the parameter tensor along the last
+ # dimensions, i.e., the PE dimension for parallel access
+ self.code_gen_dict["$PRAGMAS$"].append(
+ f"#pragma HLS ARRAY_PARTITION"
+ f" variable=rhs complete dim={len(rhs_shape)}"
+ )
+
+ # Open a file to store the constant input parameters as C++ code
+ with open(f"{code_gen_dir}/params.hpp", "w") as file:
+ # Write lines of C++ code separated by newlines to the file
+ file.write("\n".join([
+ # Insert left-hand-side and right-hand-side parameter code and
+ # append a newline at the end of the file (a trailing newline is
+ # required by the C standard and avoids problems when including)
+ lhs_code, rhs_code, "\n"
+ ]))
+
+ # Generates C++ code of type alias, global constant and macro definitions
+ def defines(self, var):
+ # Insert constants and type aliases into the dictionary
+ self.code_gen_dict["$DEFINES$"] = [
+ # Input and output element datatypes
+ f"using LhsType = {self.lhs_dtype.get_hls_datatype_str()};",
+ f"using RhsType = {self.rhs_dtype.get_hls_datatype_str()};",
+ f"using OutType = {self.out_dtype.get_hls_datatype_str()};",
+ # Width of single elements to avoid using ::width attribute which is
+ # not present for datatype float
+ f"static constexpr auto LhsWidth = {self.lhs_dtype.bitwidth()};",
+ f"static constexpr auto RhsWidth = {self.rhs_dtype.bitwidth()};",
+ f"static constexpr auto OutWidth = {self.out_dtype.bitwidth()};",
+ # Datatype of elements packed into the input stream
+ f"using LhsPacked = ap_uint<{self.get_instream_width(ind=0)}>;",
+ f"using RhsPacked = ap_uint<{self.get_instream_width(ind=1)}>;",
+ # Datatype of elements packed into the output stream
+ f"using OutPacked = ap_uint<{self.get_outstream_width(ind=0)}>;",
+ # Include the activation function type definitions and parameters
+ # Note: The typedefs in this header require the typedefs above,
+ # thus adding this to the global includes is not possible.
+ '#include "params.hpp"',
+ # Input and output HLS stream datatypes
+ "using LhsStream = hls::stream;",
+ "using RhsStream = hls::stream;",
+ "using OutStream = hls::stream;",
+ ]
+
+ # Generates C++ code for reading data from .npy (numpy format) for testing
+ # in C++ simulation
+ def read_npy_data(self):
+ # Input data is stored in numpy files in the code generation dictionary
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ # Prepare empty stream reading to append optionals
+ self.code_gen_dict["$READNPYDATA$"] = []
+ # If the left-hand-side is provided as runtime input, read code needs
+ # to be generated
+ if self.lhs_style == "input":
+ # Carrier type representing the elements in the numpy file
+ lhs_carrier_dtype = \
+ "half" if self.lhs_dtype == DataType["FLOAT16"] else "float"
+ # Generate function calls for reading the input files into the input
+ # streams
+ self.code_gen_dict["$READNPYDATA$"] += [
+ # Generate function call reading from file into the input stream
+ f'npy2apintstream<LhsPacked, LhsType, LhsWidth, '
+ f'{lhs_carrier_dtype}>(',
+ f'"{code_gen_dir}/lhs.npy", lhs_{self.hls_sname()}, false',
+ ');'
+ ]
+ # If the right-hand-side is provided as runtime input, read code needs
+ # to be generated
+ if self.rhs_style == "input":
+ # Carrier type representing the elements in the numpy file
+ rhs_carrier_dtype = \
+ "half" if self.rhs_dtype == DataType["FLOAT16"] else "float"
+ # Generate function calls for reading the input files into the input
+ # streams
+ self.code_gen_dict["$READNPYDATA$"] += [
+ # Generate function call reading from file into the input stream
+ # Note: Inputs are represented as numpy float or half, depending
+ # on the carrier type selected above
+ f'npy2apintstream<RhsPacked, RhsType, RhsWidth, '
+ f'{rhs_carrier_dtype}>(',
+ f'"{code_gen_dir}/rhs.npy", rhs_{self.hls_sname()}, false',
+ ');'
+ ]
+
+ # Generates C++ code for declaring all streams involved in C++ simulation
+ # for testing
+ def strm_decl(self):
+ # Always add the output stream to the declarations
+ self.code_gen_dict["$STREAMDECLARATIONS$"] = [
+ # Note: Assumes stream type aliases to be set in defines
+ f"OutStream out_{self.hls_sname()};"
+ ]
+ # If the left-hand-side is provided as runtime input, a stream
+ # declaration needs to be generated
+ if self.lhs_style == "input":
+ # Generate a stream declaration
+ self.code_gen_dict["$STREAMDECLARATIONS$"] += [
+ # Note: Assumes stream type aliases to be set in defines
+ f"LhsStream lhs_{self.hls_sname()};"
+ ]
+ # If the right-hand-side is provided as runtime input, a stream
+ # declaration needs to be generated
+ if self.rhs_style == "input":
+ # Generate a stream declaration
+ self.code_gen_dict["$STREAMDECLARATIONS$"] += [
+ # Note: Assumes stream type aliases to be set in defines
+ f"RhsStream rhs_{self.hls_sname()};"
+ ]
+
+ # Generates C++ code for calling the computation part of the operator
+ def docompute(self):
+ # Add padding ones to a shape to match the broadcast output shape
+ def pad_shape(shape):
+ return (len(out_shape) - len(shape)) * (1,) + shape
+
+ # Get the folded shapes of all tensors involved without PE axis
+ lhs_shape = self.get_folded_input_shape(ind=0)[:-1]
+ rhs_shape = self.get_folded_input_shape(ind=1)[:-1]
+ out_shape = self.get_folded_output_shape(ind=0)[:-1]
+ # Expanded shape of the inputs, filling with dimensions of size 1 from
+ # the left to align the shape with the broadcast shape
+ lhs_shape = pad_shape(lhs_shape)
+ rhs_shape = pad_shape(rhs_shape)
+
+ # Removes contiguous matching dimensions from a shape
+ def drop_matching_dims(shape, like):
+ # Core functionality for this is implemented in itertools
+ from itertools import dropwhile
+
+ # Compare shapes from left to right removing dimensions as long as
+ # they match
+ return *[
+ size for size, _ in dropwhile(
+ lambda x: x[0] == x[1], zip(shape, like)
+ )
+ ],
+
+ # Take away all contiguous dimensions where these align with the output
+ # shape, as these can be consumed directly without buffering to be
+ # repeated
+ lhs_buffer_shape = drop_matching_dims(lhs_shape, out_shape)
+ rhs_buffer_shape = drop_matching_dims(rhs_shape, out_shape)
+ # Expand once again, filling with dimensions of size 1 from the left to
+ # align the shape with the broadcast shape
+ lhs_buffer_shape = pad_shape(lhs_buffer_shape)
+ rhs_buffer_shape = pad_shape(rhs_buffer_shape)
+
+ # Code generation of array index strings with broadcasting
+ def make_index_string(shape):
+ # Generate index operation [i] for "normal" dimensions but reduce to
+ # hardcoded [0] for broadcast dimensions to repeat from a single
+ # buffer slot
+ return "".join([
+ f"[i{d}]" if s != 1 else "[0]" for d, s in enumerate(shape)
+ ])
+
+ # Generate the C++ code for indexing the buffers
+ lhs_index = {
+ "input": make_index_string(lhs_buffer_shape),
+ "const": make_index_string(lhs_shape)
+ }[self.lhs_style]
+ rhs_index = {
+ "input": make_index_string(rhs_buffer_shape),
+ "const": make_index_string(rhs_shape)
+ }[self.rhs_style]
+
+ # Generate C++ code for declaring an array of the buffer shapes
+ lhs_buffer_shape = "".join([f'[{size}]' for size in lhs_buffer_shape])
+ rhs_buffer_shape = "".join([f'[{size}]' for size in rhs_buffer_shape])
+
+ # Number of dimensions of the (broadcast) output. All shapes will be
+ # aligned to this number of dimensions.
+ # Note: +1 for the PE dimension
+ ndim = len(out_shape) + 1
+
+ # For-Loop template for nested loops over arbitrary many levels
+ def for_loop(level, size):
+ return f"for(std::size_t i{level} = 0; i{level}<{size}; ++i{level})"
+
+ # Generate code testing for the condition when the next element needs to
+ # be read from the input stream according to broadcasting semantics
+ def read_stream_condition(shape):
+ # Start with the assumption that none of the dimensions is
+ # broadcast, meaning each individual element needs to be read from
+ # the stream
+ condition = "true"
+ # Search for the dimensions which are broadcast
+ for dim, size in enumerate(shape):
+ # If this dimension has a size of 1 in the input but not in the
+ # output, it is broadcast and contributes to the conjunctive
+ # reading condition if this index wraps around
+ if size == 1 and out_shape[dim] != 1:
+ # Add testing for index wrap-around to the condition
+ condition += f" && (i{dim} == 0)"
+ # Return the composed reading condition
+ return condition
+
+ # Generate code for unpacking elements read from the stream into the PE-
+ # parallel buffer according to broadcasting semantics
+ def unpack_buffer(shape):
+ # Unpacking behavior depends on whether the last, i.e., folded PE
+ # dimension is broadcast
+ if shape[-1] == 1 and self.pe != self.out_shape[-1]:
+ # PE axis is broadcast, i.e., slice yields just one element
+ # which needs to be replicated
+ return "buffer(0, 0)"
+ # PE axis is not broadcast, i.e., slice actually yields parallel
+ # elements to be unpacked
+ return "buffer(pe, 0)"
+
+ # Type of memory to use for storing constant parameters
+ ram_style = RAM_STYLES[self.get_nodeattr("ram_style")]
+
+ # Write the body of the top-level function
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ # @formatter:off Disable formatter for mixed Python and C++
+ # For streamed inputs, generate local buffer of non-broadcast size
+ # but broadcasts dimensions un-squeezed to size 1. For constant
+ # inputs, use the generated parameters of the same name.
+ # For streamed inputs, implement a simple dual-port RAM partitioned
+ # on the last, i.e., the PE, axis for parallel access.
+ f"""
+ LhsType lhs{lhs_buffer_shape}[{self.pe}];
+ #pragma HLS ARRAY_PARTITION variable=lhs complete dim={ndim}
+ #pragma HLS BIND_STORAGE variable=lhs type=RAM_S2P impl={ram_style}
+ """ if self.lhs_style == "input" else """""",
+ f"""
+ RhsType rhs{rhs_buffer_shape}[{self.pe}];
+ #pragma HLS ARRAY_PARTITION variable=rhs complete dim={ndim}
+ #pragma HLS BIND_STORAGE variable=rhs type=RAM_S2P impl={ram_style}
+ """ if self.rhs_style == "input" else """""",
+ # Buffer to hold the parallel output elements: Implement a simple
+ # dual-port RAM for the output buffer, partitioned on the last,
+ # i.e., the PE, axis for parallel access.
+ # Note: The PE output should be rather small, force this into
+ # distributed memory here.
+ # TODO: Maybe reconsider this later?
+ f"""
+ OutType out[{self.pe}];
+ #pragma HLS ARRAY_PARTITION variable=out complete dim=1
+ #pragma HLS BIND_STORAGE variable=out type=RAM_S2P impl=LUTRAM
+ """,
+ # Perfect loop nest over all folded output dimensions
+ *[for_loop(dim, size) + " {" for dim, size in enumerate(out_shape)],
+ # Pipeline the loops. This should be possible as there is no code
+ # between the loop levels, i.e., this is a perfect loop nest.
+ """
+ #pragma HLS pipeline II=1 style=flp
+ """,
+ # Read from the left-hand-side input stream if new elements are
+ # needed according to broadcasting semantics
+ f"""
+ if({read_stream_condition(lhs_shape)}) {{
+ const auto buffer = Slice{{}}(
+ lhs_{self.hls_sname()}.read()
+ );
+ for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{
+ #pragma HLS unroll
+ lhs{lhs_index}[pe] = {unpack_buffer(lhs_shape)};
+ }}
+ }}
+ """ if self.lhs_style == "input" else """""",
+ # Read from the right-hand-side input stream if new elements are
+ # needed according to broadcasting semantics
+ f"""
+ if({read_stream_condition(rhs_shape)}) {{
+ const auto buffer = Slice{{}}(
+ rhs_{self.hls_sname()}.read()
+ );
+ for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{
+ #pragma HLS unroll
+ rhs{rhs_index}[pe] = {unpack_buffer(rhs_shape)};
+ }}
+ }}
+ """ if self.rhs_style == "input" else """""",
+ # Apply PE parallel elementwise operations by filling the operation
+ # template
+ f"""
+ for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{
+ #pragma HLS unroll
+ out[pe] = {self.cpp_op.format(
+ f"lhs{lhs_index}[pe]", f"rhs{rhs_index}[pe]"
+ )};
+ }}
+ """,
+ # Write the PE group into the output stream
+ f"""
+ out_{self.hls_sname()}.write(flatten(out));
+ """,
+ # Close all for-loop bodies of the generated nest
+ *["}" for _ in enumerate(out_shape)]
+ # @formatter:on End of code generation
+ ]
+
+ # Post-process the generated code to remove unnecessary white space
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ textwrap.dedent(code) for code in self.code_gen_dict["$DOCOMPUTE$"]
+ ]
+
+ # Generates C++ code for reading the output stream and converting back to
+ # numpy format for testing in C++ simulation
+ def dataoutstrm(self):
+ # Output data will be stored in numpy files in the code generation
+ # dictionary
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ # Get the expected shape of the folded output array formatted as a C++
+ # vector initializer
+ # Note: Valid formatting relies on correct placement of curly braces
+ # and line breaks: Open/close all three braces on the same line of code
+ # to avoid '\n' to be inserted into the string
+ shape = f"""{{{
+ ','.join((str(i) for i in self.get_folded_output_shape(ind=0)))
+ }}}"""
+ # Generate function call for reading from the output stream into the
+ # output file
+ # Carrier type representing the elements in the numpy file
+ out_carrier_dtype = \
+ "half" if self.out_dtype == DataType["FLOAT16"] else "float"
+ self.code_gen_dict["$DATAOUTSTREAM$"] = [
+ # Generate function call reading from stream into the output file
+ # Note: Outputs are represented as numpy float or half, depending
+ # on the carrier type selected above
+ f'apintstream2npy<OutPacked, OutType, OutWidth, '
+ f'{out_carrier_dtype}>(',
+ f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false',
+ ');',
+ ]
+
+ # Generates C++ code for saving the output of C++ simulation to a file in
+ # numpy format
+ def save_as_npy(self):
+ # Note: This seems to be empty in ALL HLSBackends. Probably it was used
+ # for something before, which is now integrated into dataoutstrm()?
+ self.code_gen_dict["$SAVEASCNPY$"] = []
+
+ # Generates essentially the head of the C++ function from which the IP block
+ # will be generated during ipgen, i.e. actual synthesis
+ def blackboxfunction(self):
+ # Check whether the inputs are provided at runtime to generate stream
+ # inputs to the toplevel interface
+ runtime_lhs = self.lhs_style == "input"
+ runtime_rhs = self.rhs_style == "input"
+ # Insert function head describing the top level interface of the
+ # attention operator
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ # Note: Assumes stream type aliases to be set in defines
+ f"void {self.onnx_node.name} (",
+ f" LhsStream &lhs_{self.hls_sname()}," if runtime_lhs else "",
+ f" RhsStream &rhs_{self.hls_sname()}," if runtime_rhs else "",
+ f" OutStream &out_{self.hls_sname()}",
+ ")",
+ ]
+
+ # Generates C++ pragmas to be inserted into the main function of the C++
+ # simulation and the ipgen-blackboxfunction as well
+ def pragmas(self):
+ # Check whether there are already pragmas in the code generation
+ # dictionary
+ if "$PRAGMAS$" not in self.code_gen_dict:
+ # If not, insert an empty list to collect more pragmas
+ self.code_gen_dict["$PRAGMAS$"] = []
+
+ # Add HLS interface directives specifying how to create RTL ports for
+ # the top-level function arguments
+ self.code_gen_dict["$PRAGMAS$"] += [
+ # Connect the output stream with an axi stream interface
+ f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}",
+ ]
+
+ # If the left-hand-side is provided as runtime input interface pragmas
+ # need to be inserted
+ if self.lhs_style == "input":
+ # Connect the lhs input stream with an axi stream interface
+ self.code_gen_dict["$PRAGMAS$"] += [
+ f"#pragma HLS INTERFACE axis port=lhs_{self.hls_sname()}",
+ ]
+
+ # If the right-hand-side is provided as runtime input interface pragmas
+ # need to be inserted
+ if self.rhs_style == "input":
+ # Connect the rhs input stream with an axi stream interface
+ self.code_gen_dict["$PRAGMAS$"] += [
+ f"#pragma HLS INTERFACE axis port=rhs_{self.hls_sname()}",
+ ]
+
+ # No block-level I/O protocol for the function return value
+ self.code_gen_dict["$PRAGMAS$"].append(
+ "#pragma HLS INTERFACE ap_ctrl_none port=return"
+ )
+
+ # Returns the names of input and output interfaces grouped by protocol
+ def get_verilog_top_module_intf_names(self):
+ # Start collecting interface names in a dictionary starting with clock
+ # and reset
+ intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa
+ # AXI stream input interfaces
+ intf_names["s_axis"] = []
+ # If the left-hand-side is provided as runtime input interface names
+ # need to be inserted
+ if self.lhs_style == "input":
+ intf_names["s_axis"] += [(
+ f"lhs_{self.hls_sname()}", self.get_instream_width_padded(ind=0)
+ )]
+ # If the right-hand-side is provided as runtime input interface names
+ # need to be inserted
+ if self.rhs_style == "input":
+ intf_names["s_axis"] += [(
+ f"rhs_{self.hls_sname()}", self.get_instream_width_padded(ind=1)
+ )]
+ # AXI stream output interfaces
+ intf_names["m_axis"] = [
+ (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0))
+ ]
+ # No AXI-MM, AXI-Lite or protocol-less interfaces
+ intf_names["aximm"] = []
+ intf_names["axilite"] = []
+ intf_names["ap_none"] = []
+ # Return the interface name dictionary
+ return intf_names
+
+
+# Derive a specialization to implement elementwise addition of two inputs
+@register_custom_op # noqa: PyCharm sees all these specializations as duplicate
+class ElementwiseAdd_hls( # noqa: Class name does not follow
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseAdd
+):
+ pass
+
+
+# Derive a specialization to implement elementwise subtraction of two inputs
+@register_custom_op
+class ElementwiseSub_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseSub
+):
+ pass
+
+
+# Derive a specialization to implement elementwise multiplication of two inputs
+@register_custom_op
+class ElementwiseMul_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseMul
+):
+ pass
+
+
+# Derive a specialization to implement elementwise division of two inputs
+@register_custom_op
+class ElementwiseDiv_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseDiv
+):
+ pass
+
+
+# TODO: ElementwiseMod_hls - Requires extra attribute selecting the function
+
+# Derive a specialization to implement elementwise logical and of two inputs
+@register_custom_op
+class ElementwiseAnd_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseAnd
+):
+ pass
+
+
+# Derive a specialization to implement elementwise logical or of two inputs
+@register_custom_op
+class ElementwiseOr_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseOr
+):
+ pass
+
+
+# Derive a specialization to implement elementwise logical xor of two inputs
+@register_custom_op
+class ElementwiseXor_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseXor
+):
+ pass
+
+
+# Derive a specialization to implement elementwise equal of two inputs
+@register_custom_op # noqa: PyCharm sees all these specializations as duplicate
+class ElementwiseEqual_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseEqual
+):
+ pass
+
+
+# Derive a specialization to implement elementwise less of two inputs
+@register_custom_op
+class ElementwiseLess_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseLess
+):
+ pass
+
+
+# Derive a specialization to implement elementwise less or equal of two inputs
+@register_custom_op
+class ElementwiseLessOrEqual_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseLessOrEqual
+):
+ pass
+
+
+# Derive a specialization to implement elementwise greater of two inputs
+@register_custom_op
+class ElementwiseGreater_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseGreater
+):
+ pass
+
+
+# Derive a specialization to implement elementwise greater or equal of two
+# inputs
+@register_custom_op
+class ElementwiseGreaterOrEqual_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseGreaterOrEqual
+):
+ pass
+
+
+# Derive a specialization to implement elementwise bitwise and of two inputs
+@register_custom_op
+class ElementwiseBitwiseAnd_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseAnd
+):
+ pass
+
+
+# Derive a specialization to implement elementwise bitwise or of two inputs
+@register_custom_op
+class ElementwiseBitwiseOr_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseOr
+):
+ pass
+
+
+# Derive a specialization to implement elementwise bitwise xor of two inputs
+@register_custom_op
+class ElementwiseBitwiseXor_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseXor
+):
+ pass
+
+
+# Derive a specialization to implement elementwise maximum of two inputs
+@register_custom_op
+class ElementwiseMaximum_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseMaximum
+):
+ pass
+
+
+# Derive a specialization to implement elementwise minimum of two inputs
+@register_custom_op
+class ElementwiseMinimum_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseMinimum
+):
+ pass
+
+
+# Derive a specialization to implement elementwise float to integer conversion
+@register_custom_op
+class ElementwiseFloat2Int_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseFloat2Int
+):
+
+ # we need to resolve the attribute types due to multiple inheritance
+ def get_nodeattr_types(self):
+ # Start with the combined operator and HLS backend attributes
+ # Note: Explicit class references instead of super() make sure the
+ # HLSBackend attributes are not skipped due to the MRO
+ attrs = ElementwiseBinaryOperation_hls.get_nodeattr_types(self)
+ # Add the attributes specific to ElementwiseFloat2Int on top
+ attrs.update(
+ elementwise_binary.ElementwiseFloat2Int.get_nodeattr_types(self)
+ )
+ # Return updated attribute dictionary
+ return attrs
+
+ # Generates list of C++ includes to be placed at the top of the generated
+ # code
+ def global_includes(self):
+ super().global_includes()
+ # additional hls_math include to get hls::round()
+ self.code_gen_dict["$GLOBALS$"] += ['#include ']
+
+ # Generates C++ code of type alias, global constant and macro definitions
+ def defines(self, var):
+ super().defines(var)
+
+ # Define macros for clipping/saturating values
+ # Note: Arguments are parenthesized so the expansion stays well-defined
+ # for compound expressions
+ self.code_gen_dict["$DEFINES$"] += [
+ "#define clip_min(x, minval) ((x) >= (minval) ? (x) : (minval))",
+ "#define clip_max(x, maxval) ((x) <= (maxval) ? (x) : (maxval))",
+ "#define clip(x, y, z) clip_max(clip_min((x), (y)), (z))",
+ ]
+
+
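For reference, the clipping macros defined above implement plain saturation; a Python equivalent (illustrative only):

    # Python equivalent of the clip macros (illustrative only)
    def clip(x, minval, maxval):
        # saturate x into the closed interval [minval, maxval]
        return min(max(x, minval), maxval)

    assert clip(300, -128, 127) == 127
    assert clip(-300, -128, 127) == -128
    assert clip(5, -128, 127) == 5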
+# Derive a specialization to implement elementwise casting
+@register_custom_op
+class ElementwiseFloatCast_hls( # noqa: Class name does not follow
+ # CapWords convention
+ ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseFloatCast
+):
+
+ # we need to resolve the attribute types due to multiple inheritance
+ def get_nodeattr_types(self):
+ # Start with the combined operator and HLS backend attributes
+ # Note: Explicit class references instead of super() make sure the
+ # HLSBackend attributes are not skipped due to the MRO
+ attrs = ElementwiseBinaryOperation_hls.get_nodeattr_types(self)
+ # Add the attributes specific to ElementwiseFloatCast on top
+ attrs.update(
+ elementwise_binary.ElementwiseFloatCast.get_nodeattr_types(self)
+ )
+ # Return updated attribute dictionary
+ return attrs
+
+
+# TODO: ElementwiseBitShift_hls - Requires extra attribute selecting the
+# direction
+
+
+# # Derive a specialization to implement elementwise power of two inputs
+# TODO: std::pow does not work for HLS types and hls::pow fails to link for some
+# reason
+# @register_custom_op
+# class ElementwisePow_hls( # noqa: Class name does not follow
+# # CapWords convention
+# ElementwiseBinaryOperation_hls, elementwise_binary.ElementwisePow
+# ):
+# pass
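The stream broadcasting scheme in docompute() above hinges on two small helpers: the index-string generator and the stream-read condition. A self-contained rendition with a worked example (shapes are illustrative; the real code derives them from the folded tensor shapes):

    # Worked example of the broadcasting helpers from docompute()
    out_shape = (2, 4, 8)  # folded output shape, PE axis dropped
    lhs_shape = (1, 4, 8)  # input shape (4, 8) padded from the left

    def make_index_string(shape):
        # broadcast dimensions (size 1) index a single buffer slot [0]
        return "".join(
            f"[i{d}]" if s != 1 else "[0]" for d, s in enumerate(shape)
        )

    def read_stream_condition(shape):
        # read a new stream element only when every broadcast index wraps
        condition = "true"
        for dim, size in enumerate(shape):
            if size == 1 and out_shape[dim] != 1:
                condition += f" && (i{dim} == 0)"
        return condition

    print(make_index_string(lhs_shape))      # [0][i1][i2]
    print(read_stream_condition(lhs_shape))  # true && (i0 == 0)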
diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py
index d57699af05..6355acba9b 100644
--- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py
@@ -185,8 +185,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py
index b7ba301fbc..a39b7e5b03 100644
--- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py
@@ -140,8 +140,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py
index 9b2a7b25b0..0d2ba2ff0b 100644
--- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py
@@ -118,8 +118,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
index 1e2c0d034a..19e1318205 100644
--- a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
@@ -120,8 +120,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
index ba44deb898..98a04b0bc9 100644
--- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
@@ -297,8 +297,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
out_npy_path = "{}/output.npy".format(code_gen_dir)
diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
index cae1c30eb6..a355445c48 100644
--- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -542,7 +542,8 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
self.reset_rtlsim(sim)
- self.toggle_clk(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
if mem_mode == "external" or mem_mode == "internal_decoupled":
wnbits = self.get_weightstream_width()
export_wdt = self.get_weight_datatype()
@@ -556,10 +557,14 @@ def execute_node(self, context, graph):
"inputs": {"in0": inp, "weights": wei * num_w_reps},
"outputs": {"out": []},
}
- self.rtlsim_multi_io(sim, io_dict)
- output = io_dict["outputs"]["out"]
else:
- output = self.rtlsim(sim, inp)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py
index 64c6ec33f8..2918f88a81 100644
--- a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py
@@ -235,8 +235,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
index 4619a1756b..fb8ee42f5a 100644
--- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
@@ -177,8 +177,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
index 0d618d832a..efa98f2ea6 100644
--- a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
@@ -129,8 +129,15 @@ def execute_node(self, context, graph):
"{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp0, "in1": rtlsim_inp1},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py
index 69db7b4606..c03d9a0ece 100755
--- a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py
@@ -190,8 +190,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
index b753bc7a03..6a304de7e0 100644
--- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
@@ -297,10 +297,11 @@ def execute_node(self, context, graph):
# the second input are the weights
# the third input are the thresholds
if in_ind == 0:
- assert (
- str(context[inputs].dtype) == "float32"
- ), """Input datatype is
- not float32 as expected."""
+ assert str(context[inputs].dtype) in [
+ "float32",
+ "float16",
+ ], """Input datatype is
+ not float32 or float16 as expected."""
expected_inp_shape = self.get_folded_input_shape()
reshaped_input = context[inputs].reshape(expected_inp_shape)
if self.get_input_datatype() == DataType["BIPOLAR"]:
@@ -336,7 +337,8 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
if self.get_nodeattr("mem_mode") == "internal_decoupled":
wnbits = self.get_weightstream_width()
export_wdt = self.get_weight_datatype()
@@ -348,12 +350,16 @@ def execute_node(self, context, graph):
"inputs": {"in0": inp, "weights": wei * num_w_reps},
"outputs": {"out": []},
}
- self.rtlsim_multi_io(sim, io_dict)
- output = io_dict["outputs"]["out"]
elif self.get_nodeattr("mem_mode") == "internal_embedded":
- output = self.rtlsim(sim, inp)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
else:
raise Exception("Unrecognized mem_mode")
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -412,7 +418,7 @@ def read_npy_data(self):
packed_bits = self.get_instream_width()
packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
+ npy_type = "half" if dtype == DataType["FLOAT16"] else "float"
npy_in = "%s/input_0.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"] = []
# note: the innermost dim is reversed for the input
@@ -434,7 +440,7 @@ def read_npy_data(self):
packed_bits = self.get_weightstream_width()
packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = tdt.get_hls_datatype_str()
- npy_type = "float"
+ npy_type = "half" if tdt == DataType["FLOAT16"] else "float"
npy_in = "%s/thresholds.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
@@ -670,6 +676,12 @@ def code_generation_ipi(self):
"connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
% (node_name, clk_name, node_name, strm_inst)
)
+ # The 2x clock is not used for decoupled thresholds;
+ # simply connect this input to the 1x clock for now
+ cmd.append(
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk2x]"
+ % (node_name, clk_name, node_name, strm_inst)
+ )
cmd.append(
"connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
% (node_name, rst_name, node_name, node_name, rst_name)
diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
index 05d26eddb2..0dfe9096b0 100644
--- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
@@ -148,8 +148,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
index f9ba68e6b6..455d477c88 100644
--- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
@@ -191,7 +191,8 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
if mem_mode == "external" or mem_mode == "internal_decoupled":
wnbits = self.get_weightstream_width()
@@ -208,10 +209,14 @@ def execute_node(self, context, graph):
"inputs": {"in0": inp, "weights": wei * num_w_reps},
"outputs": {"out": []},
}
- self.rtlsim_multi_io(sim, io_dict)
- output = io_dict["outputs"]["out"]
else:
- output = self.rtlsim(sim, inp)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..a0c61ec5b3 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -42,6 +42,11 @@
except ModuleNotFoundError:
PyVerilator = None
+try:
+ import pyxsi_utils
+except ModuleNotFoundError:
+ pyxsi_utils = None
+
class HLSBackend(ABC):
"""HLSBackend class all custom ops that correspond to a finn-hlslib
@@ -54,6 +59,8 @@ def get_nodeattr_types(self):
"code_gen_dir_cppsim": ("s", False, ""),
"executable_path": ("s", False, ""),
"res_hls": ("s", False, ""),
+ # temporary node attribute to keep track of the interface style of HLS ops
+ "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}),
}
def get_all_verilog_paths(self):
@@ -65,8 +72,15 @@ def get_all_verilog_paths(self):
), """Node attribute "code_gen_dir_ipgen" is
not set. Please run HLSSynthIP first."""
verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name)
- # default impl only returns the HLS verilog codegen dir
- return [verilog_path]
+ subcore_verilog_path = "{}/project_{}/sol1/impl/ip/hdl/ip/".format(
+ code_gen_dir, self.onnx_node.name
+ )
+ # default impl only returns the HLS verilog codegen dir and subcore (impl/ip/hdl/ip) dir
+ # if it exists
+ ret = [verilog_path]
+ if os.path.isdir(subcore_verilog_path):
+ ret += [subcore_verilog_path]
+ return ret
def get_all_verilog_filenames(self, abspath=False):
"Return list of all Verilog files used for this node."
@@ -87,25 +101,39 @@ def prepare_rtlsim(self):
for this node, sets the rtlsim_so attribute to its path and returns
a PyVerilator wrapper around it."""
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
verilog_files = self.get_all_verilog_filenames(abspath=True)
single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
- tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
- target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v"
- make_single_source_file(verilog_files, target_file)
-
- # build the Verilator emu library
- sim = PyVerilator.build(
- self.get_verilog_top_module_name() + ".v",
- build_dir=tmp_build_dir,
- verilog_path=[single_src_dir],
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
+ if rtlsim_backend == "pyverilator":
+ if PyVerilator is None:
+ raise ImportError("Installation of PyVerilator is required.")
+ tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+ target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v"
+ make_single_source_file(verilog_files, target_file)
+
+ # build the Verilator emu library
+ sim = PyVerilator.build(
+ self.get_verilog_top_module_name() + ".v",
+ build_dir=tmp_build_dir,
+ verilog_path=[single_src_dir],
+ trace_depth=get_rtlsim_trace_depth(),
+ top_module_name=self.get_verilog_top_module_name(),
+ )
+ # save generated lib filename in attribute
+ self.set_nodeattr("rtlsim_so", sim.lib._name)
+ elif rtlsim_backend == "pyxsi":
+ ret = pyxsi_utils.compile_sim_obj(
+ self.get_verilog_top_module_name(), verilog_files, single_src_dir
+ )
+ # save generated lib filename in attribute
+ self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1])
+ # TODO: the return value of this function is never used;
+ # refactor so that it consistently returns nothing for
+ # both the pyverilator and pyxsi backends
+ sim = None
+ else:
+ assert False, "Unknown rtlsim_backend"
+
return sim
def code_generation_ipgen(self, model, fpgapart, clk):
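The value stored in rtlsim_so now differs per backend. A minimal sketch of the two cases, with hypothetical paths:

    # pyverilator: path of the compiled Verilator shared library
    rtlsim_so = sim.lib._name  # e.g. "/tmp/pyverilator_MVAU_hls_0_ab/Vmvau.so"
    # pyxsi: compile_sim_obj returns (output dir, relative .so path), joined here
    ret = ("/tmp/rtlsim_MVAU_hls_0_ab", "xsim.dir/mvau/xsimk.so")  # hypothetical
    rtlsim_so = ret[0] + "/" + ret[1]
    # -> "/tmp/rtlsim_MVAU_hls_0_ab/xsim.dir/mvau/xsimk.so"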
@@ -206,7 +234,13 @@ def code_generation_cppsim(self, model):
self.dataoutstrm()
self.save_as_npy()
- template = templates.docompute_template
+ if self.get_nodeattr("cpp_interface") == "hls_vector":
+ self.timeout_value()
+ self.timeout_condition()
+ self.timeout_read_stream()
+ template = templates.docompute_template_timeout
+ else:
+ template = templates.docompute_template
for key in self.code_gen_dict:
# transform list into long string separated by '\n'
@@ -236,6 +270,7 @@ def compile_singlenode_code(self):
builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
builder.append_includes("-I$FINN_ROOT/custom_hls")
builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
+ builder.append_includes("-I{}/include".format(os.environ["VITIS_PATH"]))
builder.append_includes("--std=c++14")
builder.append_includes("-O3")
builder.append_sources(code_gen_dir + "/*.cpp")
@@ -371,24 +406,40 @@ def read_npy_data(self):
if dtype == DataType["BIPOLAR"]:
# use binary for bipolar storage
dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = dtype.get_hls_datatype_str()
npy_type = "float"
npy_in = "%s/input_0.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- npy_in,
- self.hls_sname(),
+
+ cpp_interface = self.get_nodeattr("cpp_interface")
+
+ if cpp_interface == "packed":
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$READNPYDATA$"].append(
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
+ )
+ else:
+ folded_shape = self.get_folded_input_shape()
+ self.code_gen_dict["$READNPYDATA$"].append(
+ 'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);'
+ % (
+ elem_hls_type,
+ npy_type,
+ folded_shape[-1],
+ npy_in,
+ self.hls_sname(),
+ )
)
- )
def strm_decl(self):
"""Function to generate the commands for the stream declaration in c++,
@@ -422,27 +473,43 @@ def dataoutstrm(self):
if dtype == DataType["BIPOLAR"]:
# use binary for bipolar storage
dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
elem_hls_type = dtype.get_hls_datatype_str()
npy_type = "float"
npy_out = "%s/output.npy" % code_gen_dir
oshape = self.get_folded_output_shape()
oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- self.hls_sname(),
- oshape_cpp_str,
- npy_out,
- )
- ]
+ cpp_interface = self.get_nodeattr("cpp_interface")
+
+ if cpp_interface == "packed":
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_outstream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+
+ self.code_gen_dict["$DATAOUTSTREAM$"] = [
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ self.hls_sname(),
+ oshape_cpp_str,
+ npy_out,
+ )
+ ]
+ else:
+ folded_shape = self.get_folded_output_shape()
+ self.code_gen_dict["$DATAOUTSTREAM$"] = [
+ 'vectorstream2npy<%s, %s, %d>(strm, %s, "%s");'
+ % (
+ elem_hls_type,
+ npy_type,
+ folded_shape[-1],
+ oshape_cpp_str,
+ npy_out,
+ )
+ ]
def save_as_npy(self):
"""Function to generate the commands for saving data in .npy file in c++"""
@@ -474,3 +541,17 @@ def get_ap_int_max_w(self):
ret = max([instream, outstream])
assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
return ret
+
+ def timeout_value(self):
+ """Set timeout value for HLS functions defined for one clock cycle"""
+ self.code_gen_dict["$TIMEOUT_VALUE$"] = ["1000"]
+
+ def timeout_condition(self):
+ """Set timeout condition for HLS functions defined for one clock cycle"""
+ self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())]
+
+ def timeout_read_stream(self):
+ """Set reading output stream procedure for HLS functions defined for one clock cycle"""
+ self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [
+ "strm << out_{}.read();".format(self.hls_sname())
+ ]
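These three hooks fill the $TIMEOUT_*$ placeholders of docompute_template_timeout (added in templates.py below). With the defaults above, and assuming hls_sname() == "V", the generated main loop expands to roughly:

    # unsigned timeout = 0;
    # while(timeout < 1000){              // $TIMEOUT_VALUE$
    #   <$DOCOMPUTE$: one kernel invocation>
    #   if(out_V.empty()){ timeout++; }   // $TIMEOUT_CONDITION$
    #   else { strm << out_V.read(); timeout = 0; }  // $TIMEOUT_READ_STREAM$
    # }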
diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index b40b8f3074..ad3e9cc514 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -41,6 +41,11 @@
except ModuleNotFoundError:
PyVerilator = None
+try:
+ import pyxsi_utils
+except ModuleNotFoundError:
+ pyxsi_utils = None
+
class HWCustomOp(CustomOp):
"""HWCustomOp class all custom ops that can be implemented with either
@@ -67,6 +72,7 @@ def get_nodeattr_types(self):
"res_estimate": ("s", False, ""),
"res_synth": ("s", False, ""),
"rtlsim_so": ("s", False, ""),
+ "rtlsim_backend": ("s", False, "pyxsi", {"pyverilator", "pyxsi"}),
# partitioning info
# ID of SLR to which the Op is attached in Vitis builds
# Set to -1 as 'don't care'
@@ -132,10 +138,36 @@ def get_rtlsim(self):
rtlsim_so = self.get_nodeattr("rtlsim_so")
assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library."
- # create PyVerilator wrapper
- sim = PyVerilator(rtlsim_so)
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
+
+ if rtlsim_backend == "pyverilator":
+ # create PyVerilator wrapper
+ sim = PyVerilator(rtlsim_so)
+ elif rtlsim_backend == "pyxsi":
+ sim_base, sim_rel = rtlsim_so.split("xsim.dir")
+ sim_rel = "xsim.dir" + sim_rel
+ # pass in correct tracefile from attribute
+ tracefile = self.get_nodeattr("rtlsim_trace")
+ if tracefile == "default":
+ tracefile = self.onnx_node.name + ".wdb"
+ sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, tracefile)
+ else:
+ assert False, "Unknown rtlsim_backend"
+
return sim
+ def close_rtlsim(self, sim):
+ "Close and free up resources for rtlsim."
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
+
+ if rtlsim_backend == "pyverilator":
+ # no action needed
+ pass
+ elif rtlsim_backend == "pyxsi":
+ pyxsi_utils.close_rtlsim(sim)
+ else:
+ assert False, "Unknown rtlsim_backend"
+
def node_res_estimation(self, fpgapart):
"""Returns summarized resource estimation of BRAMs and LUTs
of the node as a dictionary."""
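The split in get_rtlsim reconstructs the two arguments load_sim_obj expects. A worked example with a hypothetical rtlsim_so value:

    rtlsim_so = "/tmp/rtlsim_MVAU_hls_0_ab/xsim.dir/mvau/xsimk.so"
    sim_base, sim_rel = rtlsim_so.split("xsim.dir")
    # sim_base == "/tmp/rtlsim_MVAU_hls_0_ab/"
    sim_rel = "xsim.dir" + sim_rel
    # sim_rel == "xsim.dir/mvau/xsimk.so"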
@@ -194,114 +226,57 @@ def get_op_and_param_counts(self):
def reset_rtlsim(self, sim):
"""Sets reset input in pyverilator to zero, toggles the clock and set it
back to one"""
- sim.io.ap_rst_n = 0
- sim.io.ap_clk = 1
- sim.io.ap_clk = 0
- sim.io.ap_rst_n = 1
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
+ if rtlsim_backend == "pyverilator":
+ sim.io.ap_rst_n = 0
+ sim.io.ap_clk = 1
+ sim.io.ap_clk = 0
+ sim.io.ap_rst_n = 1
+ elif rtlsim_backend == "pyxsi":
+ pyxsi_utils.reset_rtlsim(sim)
+ else:
+ assert False, f"Unknown rtlsim_backend {rtlsim_backend}"
def toggle_clk(self, sim):
"""Toggles the clock input in pyverilator once."""
- sim.io.ap_clk = 1
- sim.io.ap_clk = 0
-
- def rtlsim(self, sim, inp, inp2=None):
- """Runs the pyverilator simulation by passing the input values to the simulation,
- toggle the clock and observing the execution time. Function contains also an
- observation loop that can abort the simulation if no output value is produced
- after 100 cycles."""
-
- trace_file = self.get_nodeattr("rtlsim_trace")
- if trace_file != "":
- if trace_file == "default":
- trace_file = self.onnx_node.name + ".vcd"
- sim.start_vcd_trace(trace_file)
- inputs = inp
- outputs = []
- sname = self.hls_sname()
- o_ready = "out_" + sname + "_TREADY"
- o_valid = "out_" + sname + "_TVALID"
- o_data = "out_" + sname + "_TDATA"
- in0_ready = "in0_" + sname + "_TREADY"
- in0_valid = "in0_" + sname + "_TVALID"
- in0_data = "in0_" + sname + "_TDATA"
- in1_ready = "in1_" + sname + "_TREADY"
- in1_valid = "in1_" + sname + "_TVALID"
- in1_data = "in1_" + sname + "_TDATA"
-
- sim.io[o_ready] = 1
-
- # observe if output is completely calculated
- # observation_count will contain the number of cycles the calculation ran
- num_out_values = self.get_number_output_values()
- output_observed = False
- observation_count = 0
-
- # avoid infinite looping of simulation by aborting when there is no change in
- # output values after 100 cycles
- no_change_count = 0
- old_outputs = outputs
- liveness_threshold = pyverilate_get_liveness_threshold_cycles()
-
- while not (output_observed):
- sim.io[in0_valid] = 1 if len(inputs) > 0 else 0
- sim.io[in0_data] = inputs[0] if len(inputs) > 0 else 0
- if sim.io[in0_ready] == 1 and sim.io[in0_valid] == 1:
- inputs = inputs[1:]
-
- if inp2 is not None:
- sim.io[in1_valid] = 1 if len(inp2) > 0 else 0
- sim.io[in1_data] = inp2[0] if len(inp2) > 0 else 0
- if sim.io[in1_ready] == 1 and sim.io[in1_valid] == 1:
- inp2 = inp2[1:]
-
- if sim.io[o_valid] == 1 and sim.io[o_ready] == 1:
- outputs = outputs + [sim.io[o_data]]
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
+ if rtlsim_backend == "pyverilator":
sim.io.ap_clk = 1
sim.io.ap_clk = 0
-
- observation_count = observation_count + 1
- no_change_count = no_change_count + 1
-
- if len(outputs) == num_out_values:
- self.set_nodeattr("cycles_rtlsim", observation_count)
- output_observed = True
-
- if no_change_count == liveness_threshold:
- if old_outputs == outputs:
- if trace_file != "":
- sim.flush_vcd_trace()
- sim.stop_vcd_trace()
- raise Exception(
- "Error in simulation! Takes too long to produce output. "
- "Consider setting the LIVENESS_THRESHOLD env.var. to a "
- "larger value."
- )
- else:
- no_change_count = 0
- old_outputs = outputs
- if trace_file != "":
- sim.flush_vcd_trace()
- sim.stop_vcd_trace()
- return outputs
+ elif rtlsim_backend == "pyxsi":
+ pyxsi_utils.toggle_clk(sim)
+ else:
+ assert False, f"Unknown rtlsim_backend {rtlsim_backend}"
def rtlsim_multi_io(self, sim, io_dict):
"Run rtlsim for this node, supports multiple i/o streams."
-
- # signal name
+ # signal name suffix
sname = "_" + self.hls_sname() + "_"
-
- trace_file = self.get_nodeattr("rtlsim_trace")
- if trace_file == "default":
- trace_file = self.onnx_node.name + ".vcd"
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
num_out_values = self.get_number_output_values()
- total_cycle_count = rtlsim_multi_io(
- sim,
- io_dict,
- num_out_values,
- trace_file=trace_file,
- sname=sname,
- liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
- )
+ if rtlsim_backend == "pyverilator":
+ trace_file = self.get_nodeattr("rtlsim_trace")
+ if trace_file == "default":
+ trace_file = self.onnx_node.name + ".vcd"
+ total_cycle_count = rtlsim_multi_io(
+ sim,
+ io_dict,
+ num_out_values,
+ trace_file=trace_file,
+ sname=sname,
+ liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+ )
+ elif rtlsim_backend == "pyxsi":
+ total_cycle_count = pyxsi_utils.rtlsim_multi_io(
+ sim,
+ io_dict,
+ num_out_values,
+ sname=sname,
+ liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+ )
+ else:
+ assert False, f"Unknown rtlsim_backend {rtlsim_backend}"
+
self.set_nodeattr("cycles_rtlsim", total_cycle_count)
def generate_params(self, model, path):
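Taken together, these hwcustomop changes make the rtlsim lifecycle backend-agnostic. A minimal sketch of the calling pattern that execute_node implementations follow after this diff (op and inp hypothetical):

    sim = op.get_rtlsim()
    op.reset_rtlsim(sim)
    if op.get_nodeattr("rtlsim_backend") == "pyverilator":
        op.toggle_clk(sim)  # only pyverilator needs an extra toggle after reset
    io_dict = {"inputs": {"in0": inp}, "outputs": {"out": []}}
    op.rtlsim_multi_io(sim, io_dict)
    op.close_rtlsim(sim)  # frees the XSI kernel; no-op for pyverilator
    output = io_dict["outputs"]["out"]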
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index bbe5b850b1..bd59f94892 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -25,10 +25,10 @@
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
import math
import numpy as np
import onnx.numpy_helper as np_helper
+import os
import qonnx.custom_op.general.xnorpopcount as xp
import textwrap
import warnings
@@ -124,6 +124,7 @@ def get_nodeattr_types(self):
# vector through the accelerator. This will get rid of any old
# weight data from the weight FIFOs.
"runtime_writeable_weights": ("i", False, 0, {0, 1}),
+ "pumpedMemory": ("i", False, 0, {0, 1}),
}
my_attrs.update(super().get_nodeattr_types())
return my_attrs
@@ -724,6 +725,15 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
# add zeroes to pad out file to 1024 entries
weight_stream = weight_tensor_pe_flipped.flatten()
weight_stream = weight_stream.copy()
+ if self.get_nodeattr("pumpedMemory"):
+ split_w_stream = np.zeros([weight_stream.shape[0] * 2], dtype=object)
+ k = 0
+ for i in range(len(weight_stream)):
+ weight = weight_stream[i]
+ split_w_stream[k] = weight[len(weight) // 2 :]
+ split_w_stream[k + 1] = weight[: len(weight) // 2]
+ k += 2
+ weight_stream = split_w_stream
with open(weight_file_name, "w") as f:
for val in weight_stream:
f.write(val + "\n")
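A worked example of the pumpedMemory split, assuming weight_stream holds fixed-width hex strings as produced for .dat memstream files:

    weight = "89abcdef"  # one hypothetical 32-bit entry
    lo = weight[len(weight) // 2 :]  # "cdef", least-significant half, emitted first
    hi = weight[: len(weight) // 2]  # "89ab", most-significant half, emitted second
    # ["89abcdef"] becomes ["cdef", "89ab"]: the stream doubles in length,
    # presumably so the 2x-clocked streamer can deliver each word as two
    # half-width beats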
@@ -868,6 +878,14 @@ def derive_characteristic_fxns(self, period):
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
+ try:
+ pumped_compute = self.get_nodeattr("pumpedCompute")
+ except AttributeError:
+ pumped_compute = 0
+
+ if pumped_compute or self.get_nodeattr("pumpedMemory"):
+ intf_names["clk2x"] = ["ap_clk2x"]
+
mem_mode = self.get_nodeattr("mem_mode")
sname = self.hls_sname()
if mem_mode == "external":
@@ -879,16 +897,50 @@ def get_verilog_top_module_intf_names(self):
intf_names["axilite"] = ["s_axilite"]
return intf_names
+ def generate_hdl_memstream(self):
+ template_path = (
+ os.environ["FINN_ROOT"] + "/finn-rtllib/memstream/hdl/memstream_wrapper_template.v"
+ )
+ mname = self.onnx_node.name
+ wmem = self.calc_wmem()
+ padded_width = self.get_weightstream_width_padded()
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+ code_gen_dict = {
+ "$MODULE_NAME$": [mname],
+ "$DEPTH$": [str(wmem)],
+ "$WIDTH$": [str(padded_width)],
+ "$INIT_FILE$": [
+ self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
+ ],
+ "$RAM_STYLE$": [self.get_nodeattr("ram_style")],
+ "$PUMPED_MEMORY$": [str(self.get_nodeattr("pumpedMemory"))],
+ }
+ # apply code generation to template
+ with open(template_path, "r") as f:
+ template_wrapper = f.read()
+ for key in code_gen_dict:
+ # transform list into long string separated by '\n'
+ code_gen_line = "\n".join(code_gen_dict[key])
+ template_wrapper = template_wrapper.replace(key, code_gen_line)
+ with open(
+ os.path.join(code_gen_dir, mname + "_memstream_wrapper.v"),
+ "w",
+ ) as f:
+ f.write(template_wrapper)
+
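The wrapper generation is plain placeholder substitution. A sketch with hypothetical values (the real placeholders live in memstream_wrapper_template.v):

    code_gen_dict = {"$MODULE_NAME$": ["MVAU_hls_0"], "$DEPTH$": ["1024"], "$WIDTH$": ["80"]}
    template = "module $MODULE_NAME$ #(parameter DEPTH=$DEPTH$, WIDTH=$WIDTH$) (...);"
    for key, lines in code_gen_dict.items():
        template = template.replace(key, "\n".join(lines))
    # -> "module MVAU_hls_0 #(parameter DEPTH=1024, WIDTH=80) (...);"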
def code_generation_ipi(self):
- cmd = []
+ source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name
+ cmd = ["file mkdir %s" % source_target]
# add streamer if needed
mem_mode = self.get_nodeattr("mem_mode")
if mem_mode == "internal_decoupled":
+ self.generate_hdl_memstream()
runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
- if self.get_nodeattr("ram_style") == "ultra":
- assert (
- runtime_writable == 1
- ), "Layer with URAM weights must have runtime_writeable_weights=1"
+ # if self.get_nodeattr("ram_style") == "ultra":
+ # assert (
+ # runtime_writable == 1
+ # ), "Layer with URAM weights must have runtime_writeable_weights=1"
node_name = self.onnx_node.name
sname = self.hls_sname()
# create a hierarchy for this layer, with the same port names
@@ -898,6 +950,17 @@ def code_generation_ipi(self):
din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0]
cmd.append("create_bd_cell -type hier %s" % node_name)
cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+ # if we need a 2x clock for either compute or memory, instantiate the 2x clk port
+ try:
+ pumped_compute = self.get_nodeattr("pumpedCompute")
+ except AttributeError:
+ pumped_compute = 0
+
+ if pumped_compute or self.get_nodeattr("pumpedMemory"):
+ clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0]
+ cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk2x_name))
+ else:
+ clk2x_name = None
cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
cmd.append(
"create_bd_intf_pin -mode Master "
@@ -907,31 +970,28 @@ def code_generation_ipi(self):
"create_bd_intf_pin -mode Slave "
"-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
)
# Instantiate either the HLS or RTL IP depending on operator
self.instantiate_ip(cmd)
-
- # instantiate a streamer and connect it to the HLS IP
- strm_vlnv = "amd.com:finn:memstream:1.0"
+ # instantiate a streamer and connect it to the IP
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ swg_rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/memstream/hdl/")
+ strm_tmpl_name = node_name + "_memstream_wrapper"
+ sourcefiles = [
+ os.path.join(code_gen_dir, strm_tmpl_name + ".v"),
+ swg_rtllib_dir + "axilite_if.v",
+ swg_rtllib_dir + "memstream_axi.sv",
+ swg_rtllib_dir + "memstream.sv",
+ ]
+ for f in sourcefiles:
+ cmd += ["add_files -copy_to %s -norecurse %s" % (source_target, f)]
strm_inst = node_name + "_wstrm"
+
cmd.append(
- "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst)
- )
- cmd.append(
- "set_property -dict [list "
- "CONFIG.DEPTH {%d} "
- "CONFIG.WIDTH {%d} "
- "CONFIG.INIT_FILE {%s} "
- "CONFIG.RAM_STYLE {%s} "
- "] [get_bd_cells /%s/%s]"
- % (
- self.calc_wmem(),
- self.get_weightstream_width_padded(),
- self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
- self.get_nodeattr("ram_style"),
- node_name,
- strm_inst,
- )
+ "create_bd_cell -type hier -reference %s /%s/%s"
+ % (strm_tmpl_name, node_name, strm_inst)
)
+
cmd.append(
"connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
"[get_bd_intf_pins %s/%s/weights_%s]"
@@ -945,6 +1005,18 @@ def code_generation_ipi(self):
"connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
% (node_name, clk_name, node_name, strm_inst)
)
+ # if using 2x pumped memory, connect the memstreamer's 2x clk input
+ # to the 2x clock port. otherwise connect it to the regular clock port.
+ if self.get_nodeattr("pumpedMemory"):
+ cmd.append(
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk2x]"
+ % (node_name, clk2x_name, node_name, strm_inst)
+ )
+ else:
+ cmd.append(
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk2x]"
+ % (node_name, clk_name, node_name, strm_inst)
+ )
cmd.append(
"connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
% (node_name, rst_name, node_name, node_name, rst_name)
diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py
index 321522e7ba..3c063c00d9 100755
--- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py
@@ -40,14 +40,8 @@
ConvolutionInputGenerator,
)
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-try:
- from pyverilator import PyVerilator
-except ModuleNotFoundError:
- PyVerilator = None
-
# RTL Convolution Input Generator / Sliding Window Generator (SWG)
# Matches and extends the functionality of all ConvolutionInputGenerator_* functions
# in finn-hlslib by generating HDL code for two different implementation styles:
@@ -336,8 +330,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -932,37 +933,23 @@ def generate_hdl(self, model, fpgapart, clk):
self.set_nodeattr("ipgen_path", code_gen_dir)
self.set_nodeattr("ip_path", code_gen_dir)
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
- # Modified to use generated (System-)Verilog instead of HLS output products
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- verilog_paths = [code_gen_dir]
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/swg/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
verilog_files = [
- "swg_pkg.sv",
- self.get_nodeattr("gen_top_module") + "_wrapper.v",
- self.get_nodeattr("gen_top_module") + "_impl.sv",
- "swg_common.sv",
+ rtllib_dir + "swg_pkg.sv",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper.v",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + "_impl.sv",
+ rtllib_dir + "swg_common.sv",
]
if self.get_nodeattr("dynamic_mode"):
- verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v")
-
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
- return sim
+ verilog_files.append(code_gen_dir + self.get_nodeattr("gen_top_module") + "_axilite.v")
+
+ return verilog_files
def code_generation_ipi(self):
"""Constructs and returns the TCL for node instantiation in Vivado IPI."""
diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py
index cc49446ea3..6ee1e27e2d 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py
@@ -34,14 +34,8 @@
from finn.custom_op.fpgadataflow.fmpadding import FMPadding
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-try:
- from pyverilator import PyVerilator
-except ModuleNotFoundError:
- PyVerilator = None
-
class FMPadding_rtl(FMPadding, RTLBackend):
"""CustomOp wrapper for the finn-rtllib fmpadding_axi component
@@ -96,8 +90,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -206,35 +207,21 @@ def generate_hdl(self, model, fpgapart, clk):
self.set_nodeattr("ipgen_path", code_gen_dir)
self.set_nodeattr("ip_path", code_gen_dir)
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
- # Modified to use generated (System-)Verilog instead of HLS output products
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/fmpadding/hdl/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- verilog_paths = [code_gen_dir]
verilog_files = [
- "fmpadding_axi.sv",
- "fmpadding.sv",
- "axi2we.sv",
- self.get_nodeattr("gen_top_module") + ".v",
+ rtllib_dir + "fmpadding_axi.sv",
+ rtllib_dir + "fmpadding.sv",
+ rtllib_dir + "axi2we.sv",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + ".v",
]
-
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
- return sim
+ return verilog_files
def code_generation_ipi(self):
"""Constructs and returns the TCL for node instantiation in Vivado IPI."""
diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
index d9ab501117..c072fb28b3 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py
@@ -28,11 +28,10 @@
import numpy as np
import os
-from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk
from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
-from finn.util.basic import get_dsp_block, get_rtlsim_trace_depth, make_build_dir
+from finn.util.basic import get_dsp_block
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
try:
@@ -55,7 +54,10 @@ def __init__(self, onnx_node, **kwargs):
super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = {}
+ my_attrs = {
+ # Double-pumped DSPs enabled
+ "pumpedCompute": ("i", False, 0, {0, 1}),
+ }
my_attrs.update(MVAU.get_nodeattr_types(self))
my_attrs.update(RTLBackend.get_nodeattr_types(self))
return my_attrs
@@ -91,12 +93,12 @@ def execute_node(self, context, graph):
elif in_ind > 1:
raise Exception("Unexpected input found for MatrixVectorActivation_rtl")
in_ind += 1
-
sim = self.get_rtlsim()
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
- reset_rtlsim(sim)
- toggle_clk(sim)
+ super().reset_rtlsim(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
if mem_mode in ["external", "internal_decoupled"]:
wnbits = self.get_weightstream_width()
export_wdt = self.get_weight_datatype()
@@ -108,10 +110,14 @@ def execute_node(self, context, graph):
"inputs": {"in0": inp, "weights": wei * num_w_reps},
"outputs": {"out": []},
}
- self.rtlsim_multi_io(sim, io_dict)
- output = io_dict["outputs"]["out"]
else:
- output = self.rtlsim(sim, inp)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -147,6 +153,7 @@ def dsp_estimation(self, fpgapart):
def instantiate_ip(self, cmd):
# instantiate the RTL IP
+ node_name = self.onnx_node.name
code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
sourcefiles = [
@@ -165,8 +172,8 @@ def instantiate_ip(self, cmd):
"create_bd_cell -type hier -reference %s /%s/%s"
% (
self.get_nodeattr("gen_top_module"),
- self.onnx_node.name,
- self.onnx_node.name,
+ node_name,
+ node_name,
)
)
else:
@@ -174,23 +181,44 @@ def instantiate_ip(self, cmd):
"create_bd_cell -type hier -reference %s %s"
% (
self.get_nodeattr("gen_top_module"),
- self.onnx_node.name,
+ node_name,
)
)
+ # if using 2x pumped compute, connect the MVU's 2x clk input
+ # to the 2x clock port. Otherwise connect 2x clk to regular clk port
+ clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+ if self.get_nodeattr("pumpedCompute") or self.get_nodeattr("pumpedMemory"):
+ clk2x_name = self.get_verilog_top_module_intf_names()["clk2x"][0]
+ cmd.append(
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+ % (node_name, clk2x_name, node_name, node_name, clk2x_name)
+ )
+ else:
+ cmd.append(
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk2x]"
+ % (node_name, clk_name, node_name, node_name)
+ )
def _resolve_segment_len(self, clk):
# Insert pipeline registers in the DSP58 chain to meet target clock frequency
# ~0.741 ns seems the worst-case delay through first DSP
# ~0.605 ns seems to be (on average) delay for all subsequent DSPs
# clk >= (critical_path_dsps - 1) * 0.605 + 0.741
+ if self.get_nodeattr("pumpedCompute"):
+ ref_clk = clk / 2
+ simd_factor = 6
+ else:
+ ref_clk = clk
+ simd_factor = 3
+
assert (
- clk > 0.741
+ ref_clk > 0.741
), """Infeasible clk target of {} ns has been set,
consider lowering the targeted clock frequency!""".format(
- clk
+ ref_clk
)
- critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1)
- max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3)
+ critical_path_dsps = np.floor((ref_clk - 0.741) / 0.605 + 1)
+ max_chain_len = np.ceil(self.get_nodeattr("SIMD") / simd_factor)
dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len
return dsp_chain_len
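A worked example of the pumped-compute branch, taking a hypothetical clk target of 3.0 ns and SIMD = 24:

    ref_clk = 3.0 / 2  # 1.5 ns, the period seen by the 2x DSP clock
    critical_path_dsps = np.floor((1.5 - 0.741) / 0.605 + 1)  # -> 2.0
    max_chain_len = np.ceil(24 / 6)  # -> 4.0
    dsp_chain_len = 2.0  # the smaller of the two

The doubling of simd_factor from 3 to 6 presumably reflects each DSP chain absorbing twice the SIMD lanes when clocked at 2x.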
@@ -249,7 +277,7 @@ def generate_hdl(self, model, fpgapart, clk):
os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"),
"w",
) as f:
- f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0)))
+ f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1)))
with open(
os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"),
"w",
@@ -268,6 +296,7 @@ def prepare_codegen_default(self, fpgapart, clk):
code_gen_dict = {}
code_gen_dict["$IS_MVU$"] = [str(1)]
code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(dsp_block)]
+ code_gen_dict["$PUMPED_COMPUTE$"] = [str(self.get_nodeattr("pumpedCompute"))]
code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))]
code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))]
code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))]
@@ -282,28 +311,24 @@ def prepare_codegen_default(self, fpgapart, clk):
return template_path, code_gen_dict
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- # Path to (System-)Verilog files used by top-module & path to top-module
- verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
- verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
-
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
+ verilog_files = [
+ code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v",
+ rtllib_dir + "mvu_vvu_axi.sv",
+ rtllib_dir + "replay_buffer.sv",
+ rtllib_dir + "mvu_4sx4u.sv",
+ rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+ rtllib_dir + "mvu_8sx8u_dsp48.sv",
+ ]
+ return verilog_files
- return sim
+ def get_verilog_paths(self):
+ verilog_paths = super().get_verilog_paths()
+ verilog_paths.append(os.environ["FINN_ROOT"] + "/finn-rtllib/mvu")
+ return verilog_paths
diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py
index e79782eb6d..496e38acfc 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py
@@ -34,14 +34,8 @@
from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
StreamingDataWidthConverter,
)
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-try:
- from pyverilator import PyVerilator
-except ModuleNotFoundError:
- PyVerilator = None
-
class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend):
"""Class that corresponds to finn-rtllib datawidth converter
@@ -100,8 +94,15 @@ def execute_node(self, context, graph):
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits
)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
odt = export_idt
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -167,34 +168,21 @@ def generate_hdl(self, model, fpgapart, clk):
self.set_nodeattr("ipgen_path", code_gen_dir)
self.set_nodeattr("ip_path", code_gen_dir)
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
- # Modified to use generated (System-)Verilog instead of HLS output products
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/hdl/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- verilog_paths = [code_gen_dir]
verilog_files = [
- "dwc_axi.sv",
- "dwc.sv",
- self.get_nodeattr("gen_top_module") + ".v",
+ rtllib_dir + "dwc_axi.sv",
+ rtllib_dir + "dwc.sv",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + ".v",
]
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
- return sim
+ return verilog_files
def code_generation_ipi(self):
"""Constructs and returns the TCL for node instantiation in Vivado IPI."""
diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
index f8f27cb647..05b45f9e4b 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py
@@ -33,14 +33,8 @@
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
-from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-try:
- from pyverilator import PyVerilator
-except ModuleNotFoundError:
- PyVerilator = None
-
class StreamingFIFO_rtl(StreamingFIFO, RTLBackend):
def __init__(self, onnx_node, **kwargs):
@@ -152,8 +146,15 @@ def execute_node(self, context, graph):
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- output = self.rtlsim(sim, inp)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = DataType[self.get_nodeattr("dataType")]
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -254,30 +255,23 @@ def code_generation_ipi(self):
"FIFO implementation style %s not supported, please use rtl or vivado" % impl_style
)
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/fifo/hdl/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
+
+ verilog_files = [
+ rtllib_dir + "Q_srl.v",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + ".v",
+ ]
+ return verilog_files
+
def prepare_rtlsim(self):
assert self.get_nodeattr("impl_style") != "vivado", (
"StreamingFIFO impl_style "
"cannot be vivado for rtlsim. Only impl_style=rtl supported."
)
- # Modified to use generated (System-)Verilog instead of HLS output products
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- verilog_paths = [code_gen_dir]
- verilog_files = [
- "Q_srl.v",
- self.get_nodeattr("gen_top_module") + ".v",
- ]
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
- return sim
+ return super().prepare_rtlsim()
diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
index d1e9387b1b..4f35ffd94c 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
@@ -30,19 +30,12 @@
import numpy as np
import os
import shutil
-from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io
from qonnx.core.datatype import DataType
from qonnx.util.basic import roundup_to_integer_multiple
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
from finn.custom_op.fpgadataflow.thresholding import Thresholding
-from finn.util.basic import (
- get_memutil_alternatives,
- get_rtlsim_trace_depth,
- make_build_dir,
- mem_primitives_versal,
- pyverilate_get_liveness_threshold_cycles,
-)
+from finn.util.basic import get_memutil_alternatives, mem_primitives_versal
from finn.util.data_packing import (
npy_to_rtlsim_input,
pack_innermost_dim_as_hex_string,
@@ -245,9 +238,7 @@ def prepare_codegen_rtl_values(self, model):
code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name]
# Identify the module name
- code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [
- self.get_verilog_top_module_name() + "_axi_wrapper"
- ]
+ code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()]
# Set the top module name - AXI wrapper
code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"]
@@ -269,6 +260,12 @@ def prepare_codegen_rtl_values(self, model):
code_gen_dict["$SIGNED$"] = [str(1)]
else:
code_gen_dict["$SIGNED$"] = [str(0)]
+ # Is the input datatype non-integer?
+ # (assume this means floating-point)
+ if self.get_input_datatype().is_integer():
+ code_gen_dict["$FPARG$"] = [str(0)]
+ else:
+ code_gen_dict["$FPARG$"] = [str(1)]
if bias >= 0:
o_bits = math.ceil(math.log2(2**o_bitwidth + bias))
@@ -289,46 +286,22 @@ def prepare_codegen_rtl_values(self, model):
code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)]
return code_gen_dict
- def get_rtl_file_list(self):
+ def get_rtl_file_list(self, abspath=False):
"""Thresholding binary search RTL file list"""
- return [
- "axilite_if.v",
- "thresholding.sv",
- "thresholding_axi.sv",
- "thresholding_template_wrapper.v",
- ]
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/thresholding/hdl/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
- def get_rtl_file_paths(self):
- """Get full path of all RTL files"""
- rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/"
- rtl_file_list = self.get_rtl_file_list()
- rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list]
- return rtl_file_paths
-
- def get_rtl_template_data(self, path):
- """Return RTL file contents as a template"""
- with open(path, "r") as f:
- template = f.read()
- return template
-
- def fill_in_rtl_template_data(self, replace_dict, template_data):
- """Use attribute values to finn in RTL template placeholders"""
- template_data_cp = template_data
- for key in replace_dict:
- replacement_line = "\n".join(replace_dict[key])
- template_data_cp = template_data_cp.replace(key, replacement_line)
- return template_data_cp
-
- def dump_rtl_data(self, dest_dir, filename, data):
- """Dump filled-in-template RTL files for future synthesis step"""
- # when generating template files, handle a special case:
- # if the filename contains the word "template", replace that
- # with the node name to distinguish between instances
- if "template" in filename:
- filename = self.get_nodeattr("gen_top_module") + ".v"
- with open(os.path.join(dest_dir, filename), "w") as f:
- f.write(data)
- return
+ verilog_files = [
+ rtllib_dir + "axilite_if.v",
+ rtllib_dir + "thresholding.sv",
+ rtllib_dir + "thresholding_axi.sv",
+ code_gen_dir + self.get_nodeattr("gen_top_module") + ".v",
+ ]
+ return verilog_files
def generate_hdl(self, model, fpgapart, clk):
"""Prepare HDL files from templates for synthesis"""
@@ -342,14 +315,23 @@ def generate_hdl(self, model, fpgapart, clk):
# by PyVerilator and IPI generation
self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0])
- for rtl_file_path in self.get_rtl_file_paths():
- # read in original RTL template file
- template_data = self.get_rtl_template_data(rtl_file_path)
- # apply code generation to templates
- data = self.fill_in_rtl_template_data(code_gen_dict, template_data)
- # dump filled-in template to destination directory for compilation
- file_only_path = rtl_file_path.split("/")[-1]
- self.dump_rtl_data(code_gen_dir, file_only_path, data)
+ rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl"
+ template_path = rtlsrc + "/thresholding_template_wrapper.v"
+ with open(template_path, "r") as f:
+ template_wrapper = f.read()
+ for key in code_gen_dict:
+ # transform list into long string separated by '\n'
+ code_gen_line = "\n".join(code_gen_dict[key])
+ template_wrapper = template_wrapper.replace(key, code_gen_line)
+ with open(
+ os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + ".v"),
+ "w",
+ ) as f:
+ f.write(template_wrapper)
+
+ sv_files = ["axilite_if.v", "thresholding.sv", "thresholding_axi.sv"]
+ for sv_file in sv_files:
+ shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir)
# set ipgen_path and ip_path so that HLS-Synth transformation
# and stich_ip transformation do not complain
@@ -358,39 +340,6 @@ def generate_hdl(self, model, fpgapart, clk):
self.set_nodeattr("ip_path", code_gen_dir)
return
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- verilog_paths = [code_gen_dir]
- verilog_files = [
- x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module"))
- for x in self.get_rtl_file_list()
- ]
- dat_files = self.get_all_meminit_filenames(abspath=True)
- single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
- for dat_file in dat_files:
- shutil.copy(dat_file, single_src_dir)
-
- # build the Verilator emulation library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=single_src_dir,
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_nodeattr("gen_top_module"),
- auto_eval=False,
- )
-
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
- return sim
-
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -404,10 +353,11 @@ def execute_node(self, context, graph):
# it is assumed that the first input of the node is the data input
# the second input are the thresholds
if in_ind == 0:
- assert (
- str(context[inputs].dtype) == "float32"
- ), """Input datatype is
- not float32 as expected."""
+ assert str(context[inputs].dtype) in [
+ "float32",
+ "float16",
+ ], """Input datatype is
+ not float32 or float16 as expected."""
expected_inp_shape = self.get_folded_input_shape()
reshaped_input = context[inputs].reshape(expected_inp_shape)
@@ -431,38 +381,23 @@ def execute_node(self, context, graph):
# Create a PyVerilator wrapper of the RTLSim .so
sim = self.get_rtlsim()
nbits = self.get_instream_width()
- inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
- io_names = self.get_verilog_top_module_intf_names()
- istream_name = io_names["s_axis"][0][0]
- ostream_name = io_names["m_axis"][0][0]
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
io_dict = {
- "inputs": {istream_name: inp},
- "outputs": {ostream_name: []},
+ "inputs": {"in0": rtlsim_inp},
+ "outputs": {"out": []},
}
-
trace_file = self.get_nodeattr("rtlsim_trace")
if trace_file == "default":
trace_file = self.onnx_node.name + ".vcd"
- sname = "_"
-
- # Change into so directory to ensure threshold files can be found
- rtlsim_so = self.get_nodeattr("rtlsim_so")
- so_dir = os.path.dirname(os.path.realpath(rtlsim_so))
- olcwd = os.getcwd()
- os.chdir(so_dir)
- num_out_values = self.get_number_output_values()
- reset_rtlsim(sim)
- total_cycle_count = rtlsim_multi_io(
- sim,
- io_dict,
- num_out_values,
- trace_file=trace_file,
- sname=sname,
- liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
- )
- self.set_nodeattr("cycles_rtlsim", total_cycle_count)
- os.chdir(olcwd)
- output = io_dict["outputs"][ostream_name]
+
+ super().reset_rtlsim(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ rtlsim_output = io_dict["outputs"]["out"]
# Manage output data
odt = self.get_output_datatype()
@@ -471,7 +406,9 @@ def execute_node(self, context, graph):
out_npy_path = "{}/output.npy".format(code_gen_dir)
out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
# load and reshape output
output = np.load(out_npy_path)
@@ -489,10 +426,7 @@ def execute_node(self, context, graph):
def code_generation_ipi(self):
"""Constructs and returns the TCL commands for node instantiation as an RTL
block."""
- rtl_file_list = [
- x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module"))
- for x in self.get_rtl_file_list()
- ]
+ rtl_file_list = self.get_rtl_file_list()
code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name
cmd = ["file mkdir %s" % source_target]
diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
index 32943d86cf..23ba4f5fc9 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
@@ -28,12 +28,11 @@
import numpy as np
import os
-from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk
from qonnx.core.datatype import DataType
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
-from finn.util.basic import get_rtlsim_trace_depth, is_versal, make_build_dir
+from finn.util.basic import is_versal
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
try:
@@ -95,8 +94,9 @@ def execute_node(self, context, graph):
sim = self.get_rtlsim()
nbits = self.get_instream_width()
inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
- reset_rtlsim(sim)
- toggle_clk(sim)
+ super().reset_rtlsim(sim)
+ if self.get_nodeattr("rtlsim_backend") == "pyverilator":
+ super().toggle_clk(sim)
if mem_mode in ["external", "internal_decoupled"]:
wnbits = self.get_weightstream_width()
@@ -115,10 +115,14 @@ def execute_node(self, context, graph):
"inputs": {"in0": inp, "weights": wei * num_w_reps},
"outputs": {"out": []},
}
- self.rtlsim_multi_io(sim, io_dict)
- output = io_dict["outputs"]["out"]
else:
- output = self.rtlsim(sim, inp)
+ io_dict = {
+ "inputs": {"in0": inp},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ super().close_rtlsim(sim)
+ output = io_dict["outputs"]["out"]
odt = self.get_output_datatype()
target_bits = odt.bitwidth()
packed_bits = self.get_outstream_width()
@@ -274,28 +278,25 @@ def prepare_codegen_default(self, fpgapart, clk):
return template_path, code_gen_dict
- def prepare_rtlsim(self):
- """Creates a Verilator emulation library for the RTL code generated
- for this node, sets the rtlsim_so attribute to its path and returns
- a PyVerilator wrapper around it."""
-
- if PyVerilator is None:
- raise ImportError("Installation of PyVerilator is required.")
-
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- # Path to (System-)Verilog files used by top-module & path to top-module
- verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"]
- verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"]
+ def get_rtl_file_list(self, abspath=False):
+ if abspath:
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
+ rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/")
+ else:
+ code_gen_dir = ""
+ rtllib_dir = ""
- # build the Verilator emu library
- sim = PyVerilator.build(
- verilog_files,
- build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
- verilog_path=verilog_paths,
- trace_depth=get_rtlsim_trace_depth(),
- top_module_name=self.get_verilog_top_module_name(),
- )
- # save generated lib filename in attribute
- self.set_nodeattr("rtlsim_so", sim.lib._name)
+ verilog_files = [
+ code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v",
+ rtllib_dir + "mvu_vvu_axi.sv",
+ rtllib_dir + "replay_buffer.sv",
+ rtllib_dir + "mvu_4sx4u.sv",
+ rtllib_dir + "mvu_vvu_8sx9_dsp58.sv",
+ rtllib_dir + "mvu_8sx8u_dsp48.sv",
+ ]
+ return verilog_files
- return sim
+ def get_verilog_paths(self):
+ verilog_paths = super().get_verilog_paths()
+ verilog_paths.append(os.environ["FINN_ROOT"] + "/finn-rtllib/mvu")
+ return verilog_paths
diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py
index 2e4d647b22..5aae52ad4b 100644
--- a/src/finn/custom_op/fpgadataflow/rtlbackend.py
+++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py
@@ -28,6 +28,18 @@
from abc import ABC, abstractmethod
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+
+try:
+ from pyverilator import PyVerilator
+except ModuleNotFoundError:
+ PyVerilator = None
+
+try:
+ import pyxsi_utils
+except ModuleNotFoundError:
+ pyxsi_utils = None
+
class RTLBackend(ABC):
"""RTLBackend class all custom ops that correspond to a module in finn-rtllib
@@ -45,8 +57,56 @@ def get_nodeattr_types(self):
def generate_hdl(self, model, fpgapart, clk):
pass
- @abstractmethod
def prepare_rtlsim(self):
+ """Creates a Verilator emulation library for the RTL code generated
+ for this node, sets the rtlsim_so attribute to its path and returns
+ a PyVerilator wrapper around it."""
+
+ if PyVerilator is None:
+ raise ImportError("Installation of PyVerilator is required.")
+
+ verilog_paths = self.get_verilog_paths()
+ rtlsim_backend = self.get_nodeattr("rtlsim_backend")
+ if rtlsim_backend == "pyverilator":
+ if PyVerilator is None:
+ raise ImportError("Installation of PyVerilator is required.")
+ verilog_files = self.get_rtl_file_list(abspath=False)
+
+ # build the Verilator emu library
+ sim = PyVerilator.build(
+ verilog_files,
+ build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+ verilog_path=verilog_paths,
+ trace_depth=get_rtlsim_trace_depth(),
+ top_module_name=self.get_nodeattr("gen_top_module"),
+ )
+ # save generated lib filename in attribute
+ self.set_nodeattr("rtlsim_so", sim.lib._name)
+ elif rtlsim_backend == "pyxsi":
+ verilog_files = self.get_rtl_file_list(abspath=True)
+ single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
+ ret = pyxsi_utils.compile_sim_obj(
+ self.get_verilog_top_module_name(), verilog_files, single_src_dir
+ )
+ # save generated lib filename in attribute
+ self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1])
+ # TODO: the return value of this function is never used;
+ # refactor so that it consistently returns nothing for
+ # both the pyverilator and pyxsi backends
+ sim = None
+ else:
+ assert False, "Unknown rtlsim_backend"
+ return sim
+
+ def get_verilog_paths(self):
+ """Returns path to code gen directory. Can be overwritten to
+ return additional paths to relevant verilog files"""
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ return [code_gen_dir]
+
+ @abstractmethod
+ def get_rtl_file_list(self, abspath=False):
+ """Returns list of rtl files. Needs to be filled by each node."""
pass
@abstractmethod
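Under the new contract an RTL op only has to describe its sources; the base class owns simulator construction. A minimal sketch of a conforming subclass (names hypothetical, other abstract methods omitted):

    class MyOp_rtl(MyOp, RTLBackend):
        def get_rtl_file_list(self, abspath=False):
            prefix = self.get_nodeattr("code_gen_dir_ipgen") + "/" if abspath else ""
            return [prefix + self.get_nodeattr("gen_top_module") + ".v"]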
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..56cb1f991f 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -29,9 +29,12 @@
# template for single node execution
docompute_template = """
+#define HLS_CONSTEXPR_ENABLE
#define AP_INT_MAX_W $AP_INT_MAX_W$
+#define HLS_NO_XIL_FPO_LIB
#include "cnpy.h"
#include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
#include <vector>
#include "bnn-library.h"
@@ -58,10 +61,57 @@
"""
+# template for single node execution with timeout (for single clock hls operations)
+docompute_template_timeout = """
+#define AP_INT_MAX_W $AP_INT_MAX_W$
+#include "cnpy.h"
+#include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
+#include <vector>
+#include "bnn-library.h"
+
+// includes for network parameters
+$GLOBALS$
+
+// defines for network parameters
+$DEFINES$
+
+int main(){
+$PRAGMAS$
+
+$STREAMDECLARATIONS$
+
+$READNPYDATA$
+
+unsigned timeout = 0;
+while(timeout < $TIMEOUT_VALUE$){
+
+$DOCOMPUTE$
+
+if($TIMEOUT_CONDITION$){
+timeout++;
+}
+
+else{
+$TIMEOUT_READ_STREAM$
+timeout = 0;
+}
+}
+
+$DATAOUTSTREAM$
+
+$SAVEASCNPY$
+
+}
+
+"""
+
+
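Unlike docompute_template, the $DOCOMPUTE$ body sits inside the retry loop, so HLS tops that handle a single transaction per call still drain the whole test vector. A sketch of the fill step, placeholder values hypothetical:

    code_gen_dict = {
        "$TIMEOUT_VALUE$": ["1000"],
        "$TIMEOUT_CONDITION$": ["out_V.empty()"],
        "$TIMEOUT_READ_STREAM$": ["strm << out_V.read();"],
    }
    cpp = docompute_template_timeout
    for key, lines in code_gen_dict.items():
        cpp = cpp.replace(key, "\n".join(lines))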
# templates for single node ip generation
# cpp file
ipgen_template = """
+#define HLS_CONSTEXPR_ENABLE
#define AP_INT_MAX_W $AP_INT_MAX_W$
#include "bnn-library.h"
diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py
index 12cb76be4e..12cb96994e 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding.py
@@ -133,12 +133,15 @@ def get_weightstream_width(self):
def minimize_accumulator_width(self, model):
"Minimize threshold width ('accumulator width' here due to convention)"
+ idt = self.get_input_datatype()
+ if idt == "FLOAT32" or self.get_nodeattr("weightDataType") == "FLOAT32":
+ return DataType[self.get_nodeattr("weightDataType")]
thresholds = model.get_initializer(self.onnx_node.input[1])
threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds)
min_threshold = thresholds.min()
max_threshold = thresholds.max()
- min_input = self.get_input_datatype().min()
- max_input = self.get_input_datatype().max()
+ min_input = idt.min()
+ max_input = idt.max()
# get range required by threshold values
tdt_min = min(min_input, min_threshold)
tdt_max = max(max_input, max_threshold)
@@ -215,8 +218,6 @@ def get_hw_compatible_threshold_tensor(self, orig_thres_matrix):
if not self.get_input_datatype().signed():
# ensure all thresholds are nonnegative
assert (orig_thres_matrix >= 0).all()
- # ensure all thresholds are integer
- assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor"
ret = orig_thres_matrix
# ensure channels = mh , duplicating if necessary
if ret.shape[0] == 1:
diff --git a/src/finn/qnn-data/cpp/xsi_simdriver.cpp b/src/finn/qnn-data/cpp/xsi_simdriver.cpp
new file mode 100644
index 0000000000..0a9aeded21
--- /dev/null
+++ b/src/finn/qnn-data/cpp/xsi_simdriver.cpp
@@ -0,0 +1,396 @@
+/* Copyright (C) 2024, Advanced Micro Devices, Inc.
+All rights reserved.
+#
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+#
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+#
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+#
+* Neither the name of FINN nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+#
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* C++ streaming rtlsim driver template for Verilog designs using XSI
+ - pushes input data into input AXI stream(s), either dummy or from file
+ - dumps output data from output AXI stream(s) if desired
+ - option to examine final simulation status to capture more info
+
+Note: all code template arguments formatted like @TEMPLATE@ must be filled in
+prior to compilation
+*/
+
+#include <iostream>
+#include <fstream>
+#include <cstddef>
+#include <chrono>
+// currently using the pyxsi version and not the original Vivado version
+#include "xsi_loader.h"
+
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <sstream>