From 3b1f7af3f127ea1f58c370895215f89cd5fb7013 Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Thu, 4 May 2023 18:44:59 +0200
Subject: [PATCH] Add FP8alt, low and mixed-precision SDOTP with stochastic
 rounding support, and compressed vector cmp (#3)

Added support for:
- FP8alt (1, 4, 3)
- low and mixed-precision SDOTP with stochastic rounding support
- compressed vector compare results (one bit per comparison in the LSBs)

---------

Co-authored-by: Gianna Paulin <pauling@student.ethz.ch>
---
 Bender.yml                          |    3 +
 README.md                           |   17 +
 docs/CHANGELOG-PULP.md              |   15 +
 docs/CHANGELOG.md                   |    5 +
 docs/README.md                      |   71 +-
 src/fpnew_cast_multi.sv             |    8 +-
 src/fpnew_fma.sv                    |    8 +-
 src/fpnew_fma_multi.sv              |    8 +-
 src/fpnew_opgroup_block.sv          |   10 +-
 src/fpnew_opgroup_fmt_slice.sv      |   68 +-
 src/fpnew_opgroup_multifmt_slice.sv |  207 +++-
 src/fpnew_pkg.sv                    |  113 ++-
 src/fpnew_rounding.sv               |   49 +-
 src/fpnew_sdotp_multi.sv            | 1444 +++++++++++++++++++++++++++
 src/fpnew_sdotp_multi_wrapper.sv    |  190 ++++
 src/fpnew_top.sv                    |    8 +-
 src/lfsr_sr.sv                      |  352 +++++++
 src_files.yml                       |    2 +
 18 files changed, 2464 insertions(+), 114 deletions(-)
 create mode 100644 docs/CHANGELOG-PULP.md
 create mode 100644 src/fpnew_sdotp_multi.sv
 create mode 100644 src/fpnew_sdotp_multi_wrapper.sv
 create mode 100644 src/lfsr_sr.sv
diff --git a/Bender.yml b/Bender.yml
index e11f6a40..fff51ec3 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -29,9 +29,12 @@ sources:
   - src/fpnew_divsqrt_multi.sv
   - src/fpnew_fma.sv
   - src/fpnew_fma_multi.sv
+  - src/fpnew_sdotp_multi.sv
+  - src/fpnew_sdotp_multi_wrapper.sv
   - src/fpnew_noncomp.sv
   - src/fpnew_opgroup_block.sv
   - src/fpnew_opgroup_fmt_slice.sv
   - src/fpnew_opgroup_multifmt_slice.sv
   - src/fpnew_rounding.sv
+  - src/lfsr_sr.sv
   - src/fpnew_top.sv
diff --git a/README.md b/README.md
index 0c029030..949bacc4 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,23 @@ If you use FPnew in your work, you can cite us:
 }
 ```
 
+If you use FPnew SDOTP in your work, you can cite us:
+
+<details>
+<summary>SDOTP Publication</summary>
+<p>
+
+```
+@inproceedings{bertaccini2022minifloat,
+  title={MiniFloat-NN and ExSdotp: An ISA Extension and a Modular Open Hardware Unit for Low-Precision Training on RISC-V Cores},
+  author={Bertaccini, Luca and Paulin, Gianna and Fischer, Tim and Mach, Stefan and Benini, Luca},
+  booktitle={2022 IEEE 29th Symposium on Computer Arithmetic (ARITH)},
+  pages={1--8},
+  year={2022},
+  organization={IEEE}
+}
+```
+
 </p>
 </details>
 
diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
new file mode 100644
index 00000000..196a9c03
--- /dev/null
+++ b/docs/CHANGELOG-PULP.md
@@ -0,0 +1,15 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+
+In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
+Versions of the IP in the same major release are "pin-compatible" with each other. Minor releases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
+
+## [0.1.0] - 2023-05-04
+
+### Added
+- Add low and mixed-precision SDOTP with support for stochastic rounding
+- Add `FP8alt (1,4,3)` format
+- Add support for compressed vector compare results (one bit per comparison in the LSBs)
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 3a3e1f83..f290ed64 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -10,6 +10,11 @@ Versions of the IP in the same major relase are "pin-compatible" with each other
 
 ## [Unreleased]
 
+### Added
+- Add support for alternative FP32-only DivSqrt unit
+
+## [0.7.0] - 2023-03-20
+
 ### Added
 - Citation file `CITATION.cff`
 - Add support for RISC-V compliant classify in vectorial mode when the vector element width is at least 10 bits
diff --git a/docs/README.md b/docs/README.md
index d0c0a91c..542e53e1 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -40,6 +40,8 @@ For more in-depth explanations on how to configure the unit and the layout of th
 | `TagType`        | The SystemVerilog data type of the operation tag                                                                             |
 | `TrueSIMDClass`  | If enabled, the result of a classify operation in vectorial mode will be RISC-V compliant if each output has at least 10 bits|
 | `EnableSIMDMask` | Enable the RISC-V floating-point status flags masking of inactive vectorial lanes. When disabled, `simd_mask_i` is inactive  |
+| `StochasticRndImplementation` | Enable stochastic rounding support for SDOTP, define LFSR bitwidth and number of trailing bits considered for the SR decision  |
+| `CompressedVecCmpResult` | Compress the result of a vector compare in the LSBs, conceived for RV32FD cores                                      |
 
 ### Ports
 
@@ -50,6 +52,7 @@ As the width of some input/output signals is defined by the configuration, it is
 |------------------|-----------|----------------------|----------------------------------------------------------------|
 | `clk_i`          | in        | `logic`              | Clock, synchronous, rising-edge triggered                      |
 | `rst_ni`         | in        | `logic`              | Asynchronous reset, active low                                 |
+| `hart_id_i`      | in        | `logic [31:0]`       | Core ID, used only when stochastic rounding is enabled         |
 | `operands_i`     | in        | `logic [2:0][W-1:0]` | Operands, henceforth referred to as `op[`*i*`]`                |
 | `rnd_mode_i`     | in        | `roundmode_e`        | Floating-point rounding mode                                   |
 | `op_i`           | in        | `operation_e`        | Operation select                                               |
@@ -79,15 +82,16 @@ Default values from the package are listed.
 
 Enumeration of type `logic [2:0]` holding available rounding modes, encoded for use in RISC-V cores:
 
-| Enumerator |  Value   |                    Rounding Mode                     |
-|------------|----------|------------------------------------------------------|
-| `RNE`      | `3'b000` | To nearest, tie to even (default)                    |
-| `RTZ`      | `3'b001` | Toward zero                                          |
-| `RDN`      | `3'b010` | Toward negative infinity                             |
-| `RUP`      | `3'b011` | Toward positive infinity                             |
-| `RMM`      | `3'b100` | To nearest, tie away from zero                       |
-| `ROD`      | `3'b101` | To odd                                               |
-| `DYN`      | `3'b111` | *RISC-V Dynamic RM, invalid if passed to operations* |
+| Enumerator |  Value   |                    Rounding Mode                         |
+|------------|----------|----------------------------------------------------------|
+| `RNE`      | `3'b000` | To nearest, tie to even (default)                        |
+| `RTZ`      | `3'b001` | Toward zero                                              |
+| `RDN`      | `3'b010` | Toward negative infinity                                 |
+| `RUP`      | `3'b011` | Toward positive infinity                                 |
+| `RMM`      | `3'b100` | To nearest, tie away from zero                           |
+| `ROD`      | `3'b101` | To odd                                                   |
+| `RSR`      | `3'b110` | Stochastic Rounding (available only on SDOTP operations) |
+| `DYN`      | `3'b111` | *RISC-V Dynamic RM, invalid if passed to operations*     |
 
 ##### `operation_e` - FP Operation
 
@@ -104,6 +108,8 @@ Unless noted otherwise, the first operand `op[0]` is used for the operation.
 | `ADD`      | `0`      | Addition (`op[1] + op[2]`) *note the operand indices*                                                                                                                                                            |
 | `ADD`      | `1`      | Subtraction (`op[1] - op[2]`) *note the operand indices*                                                                                                                                                         |
 | `MUL`      | `0`      | Multiplication (`op[0] * op[1]`)                                                                                                                                                                                 |
+| `SDOTP`    | `0`      | Sum of dot product                                                                                                                                                                                   |
+| `VSUM`     | `0`      | Vector Inner Sum                                                                                                                                                                                   |
 | `DIV`      | `0`      | Division (`op[0] / op[1]`)                                                                                                                                                                                       |
 | `SQRT`     | `0`      | Square root                                                                                                                                                                                                      |
 | `SGNJ`     | `0`      | Sign injection, operation encoded in rounding mode<br>`RNE`: `op[0]` with `sign(op[1])`<br>`RTZ`: `op[0]` with `~sign(op[1])`<br>`RDN`: `op[0]` with `sign(op[0]) ^ sign(op[1])`<br>`RUP`: `op[0]` (passthrough) |
@@ -132,10 +138,11 @@ Enumeration of type `logic [2:0]` holding the supported FP formats.
 | `FP16`     | IEEE binary16 | 16 bit | 5         | 10        |
 | `FP8`      | binary8       | 8 bit  | 5         | 2         |
 | `FP16ALT`  | binary16alt   | 16 bit | 8         | 7         |
+| `FP8ALT`   | binary8alt    | 8 bit  | 4         | 3         |
 
 The following global parameters associated with FP formats are set in `fpnew_pkg`:
 ```SystemVerilog
-localparam int unsigned NUM_FP_FORMATS = 5;
+localparam int unsigned NUM_FP_FORMATS = 6;
 localparam int unsigned FP_FORMAT_BITS = $clog2(NUM_FP_FORMATS);
 ```
 
@@ -230,7 +237,7 @@ typedef struct packed {
 ```
 The fields of this struct behave as follows:
 
-##### `Width` - Datapath Wdith
+##### `Width` - Datapath Width
 
 Specifies the width of the FPU datapath and of the input and output data ports (`operands_i`/`result_o`).
 It must be larger or equal to the width of the widest enabled FP and integer format.
@@ -278,7 +285,7 @@ Otherwise, synthesis tools can optimize away any logic associated with this form
 
 #### `Implementation` - Implementation Options
 
-The FPU is divided into four operation groups,  `ADDMUL`, `DIVSQRT`, `NONDOMP`, and `CONV` (see [Architecture: Top-Level](#top-level)).
+The FPU is divided into five operation groups,  `ADDMUL`, `DIVSQRT`, `NONCOMP`, `CONV`, and `DOTP` (see [Architecture: Top-Level](#top-level)).
 The `Implementation` parameter controls the implementation of these operation groups.
 It is of type `fpu_implementation_t` which is defined as:
 ```SystemVerilog
@@ -320,17 +327,18 @@ The unit type `unit_type_t` is an enumeration of type `logic [1:0]` holding the
 The `UnitTypes` parameter allows to control resources used for the FPU by either removing operation units for certain formats and operations, or merging multiple formats into one.
 Currently, the follwoing unit types are available for the FPU operation groups:
 
-|            |      `ADDMUL`      |     `DIVSQRT`      |     `NONCOMP`      |       `CONV`       |
-|------------|--------------------|--------------------|--------------------|--------------------|
-| `PARALLEL` | :heavy_check_mark: |                    | :heavy_check_mark: |                    |
-| `MERGED`   | :heavy_check_mark: | :heavy_check_mark: |                    | :heavy_check_mark: |
+|            |      `ADDMUL`      |     `DIVSQRT`      |     `NONCOMP`      |       `CONV`       |       `DOTP`       |
+|------------|--------------------|--------------------|--------------------|--------------------|--------------------|
+| `PARALLEL` | :heavy_check_mark: |                    | :heavy_check_mark: |                    |                    |
+| `MERGED`   | :heavy_check_mark: | :heavy_check_mark: |                    | :heavy_check_mark: | :heavy_check_mark: |
 
 *Default*:
 ```SystemVerilog
 '{'{default: PARALLEL}, // ADDMUL
   '{default: MERGED},   // DIVSQRT
   '{default: PARALLEL}, // NONCOMP
-  '{default: MERGED}}   // CONV`
+  '{default: MERGED},   // CONV
+  '{default: DISABLED}} // DOTP
 ```
 (all formats within operation group use same type)
 
@@ -350,7 +358,33 @@ The configuration  `pipe_config_t` is an enumeration of type `logic [1:0]` holdi
 | `INSIDE`      | All registers are inserted at roughly the middle of the operational unit (if not possible, `BEFORE`) |
 | `DISTRIBUTED` | Registers are evenly distributed to `INSIDE`, `BEFORE`, and `AFTER` (if no `INSIDE`, all `BEFORE`)   |
 
+### `Stochastic Rounding Implementation`
 
+The `StochasticRndImplementation` parameter is used to configure the RSR support.
+It is of type `rsr_impl_t` which is defined as:
+```SystemVerilog
+typedef struct packed {
+  logic        EnableRSR;
+  int unsigned RsrPrecision;
+  int unsigned LfsrInternalPrecision;
+} rsr_impl_t;
+```
+The fields of this struct behave as follows:
+
+##### `EnableRSR` - Enable RSR support
+Enables stochastic rounding support in the `DOTP` operation group block. It instantiates an `LFSR` in the rounding module.
+
+*Default*: `1'b0`
+
+##### `RsrPrecision`
+Specifies the number of trailing bits considered for the stochastic rounding decision.
+
+*Default*: `12`
+
+##### `LfsrInternalPrecision`
+Specifies the LFSR internal bitwidth, thus controlling the pseudorandom number periodicity.
+
+*Default*: `32`
 
 ### Adding Custom Formats
 
@@ -391,7 +425,7 @@ The *operation group* is the highest level of grouping within FPnew and signifie
 
 ![FPnew](fig/top_block.png)
 
-There are currently four operation groups in FPnew which are enumerated in `opgroup_e` as outlined in the following table:
+There are currently five operation groups in FPnew which are enumerated in `opgroup_e` as outlined in the following table:
 
 | Enumerator |                  Description                  |         Associated Operations         |
 |------------|-----------------------------------------------|---------------------------------------|
@@ -399,6 +433,7 @@ There are currently four operation groups in FPnew which are enumerated in `opgr
 | `DIVSQRT`  | Division and Square Root                      | `DIV`, `SQRT`                         |
 | `NONCOMP`  | Non-Computational Operations like Comparisons | `SGNJ`, `MINMAX`, `CMP`, `CLASS`      |
 | `CONV`     | Conversions                                   | `F2I`, `I2F`, `F2F`, `CPKAB`, `CPKCD` |
+| `DOTP`     | Dot Products                                  | `SDOTP`, `EXVSUM`, `VSUM`             |
 
 Most architectural decisions for FPnew are made at very fine granularity.
 The big exception to this is the generation of vectorial hardware which is decided at top level through the `EnableVectors` parameter.
diff --git a/src/fpnew_cast_multi.sv b/src/fpnew_cast_multi.sv
index e166d0bf..fca5f3b6 100644
--- a/src/fpnew_cast_multi.sv
+++ b/src/fpnew_cast_multi.sv
@@ -544,11 +544,17 @@ module fpnew_cast_multi #(
   assign pre_round_abs = dst_is_int_q ? ifmt_pre_round_abs[int_fmt_q2] : fmt_pre_round_abs[dst_fmt_q2];
 
   fpnew_rounding #(
-    .AbsWidth ( WIDTH )
+    .AbsWidth ( WIDTH ),
+    .EnableRSR ( 0 )
   ) i_fpnew_rounding (
+    .clk_i,
+    .rst_ni,
+    .id_i                    ( '0                ),
+    .en_rsr_i                ( 1'b0              ),
     .abs_value_i             ( pre_round_abs     ),
     .sign_i                  ( input_sign_q      ), // source format
     .round_sticky_bits_i     ( round_sticky_bits ),
+    .stochastic_rounding_bits_i ( '0             ),
     .rnd_mode_i              ( rnd_mode_q        ),
     .effective_subtraction_i ( 1'b0              ), // no operation happened
     .abs_rounded_o           ( rounded_abs       ),
diff --git a/src/fpnew_fma.sv b/src/fpnew_fma.sv
index c29e7b3e..d725a5d1 100644
--- a/src/fpnew_fma.sv
+++ b/src/fpnew_fma.sv
@@ -597,11 +597,17 @@ module fpnew_fma #(
 
   // Perform the rounding
   fpnew_rounding #(
-    .AbsWidth ( EXP_BITS + MAN_BITS )
+    .AbsWidth  ( EXP_BITS + MAN_BITS ),
+    .EnableRSR ( 0 )
   ) i_fpnew_rounding (
+    .clk_i,
+    .rst_ni,
+    .id_i                    ( '0                      ),
+    .en_rsr_i                ( 1'b0                    ),
     .abs_value_i             ( pre_round_abs           ),
     .sign_i                  ( pre_round_sign          ),
     .round_sticky_bits_i     ( round_sticky_bits       ),
+    .stochastic_rounding_bits_i ( '0                   ),
     .rnd_mode_i              ( rnd_mode_q              ),
     .effective_subtraction_i ( effective_subtraction_q ),
     .abs_rounded_o           ( rounded_abs             ),
diff --git a/src/fpnew_fma_multi.sv b/src/fpnew_fma_multi.sv
index cceeae3c..e2320846 100644
--- a/src/fpnew_fma_multi.sv
+++ b/src/fpnew_fma_multi.sv
@@ -720,11 +720,17 @@ module fpnew_fma_multi #(
 
   // Perform the rounding
   fpnew_rounding #(
-    .AbsWidth ( SUPER_EXP_BITS + SUPER_MAN_BITS )
+    .AbsWidth  ( SUPER_EXP_BITS + SUPER_MAN_BITS ),
+    .EnableRSR ( 0 )
   ) i_fpnew_rounding (
+    .clk_i,
+    .rst_ni,
+    .id_i                    ( '0                      ),
+    .en_rsr_i                ( 1'b0                    ),
     .abs_value_i             ( pre_round_abs           ),
     .sign_i                  ( pre_round_sign          ),
     .round_sticky_bits_i     ( round_sticky_bits       ),
+    .stochastic_rounding_bits_i ( '0                   ),
     .rnd_mode_i              ( rnd_mode_q              ),
     .effective_subtraction_i ( effective_subtraction_q ),
     .abs_rounded_o           ( rounded_abs             ),
diff --git a/src/fpnew_opgroup_block.sv b/src/fpnew_opgroup_block.sv
index 3b50bec9..e04f72d0 100644
--- a/src/fpnew_opgroup_block.sv
+++ b/src/fpnew_opgroup_block.sv
@@ -26,6 +26,8 @@ module fpnew_opgroup_block #(
   parameter fpnew_pkg::pipe_config_t    PipeConfig    = fpnew_pkg::BEFORE,
   parameter type                        TagType       = logic,
   parameter int unsigned                TrueSIMDClass = 0,
+  parameter logic                       CompressedVecCmpResult = 0,
+  parameter fpnew_pkg::rsr_impl_t       StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
   localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS,
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
@@ -34,6 +36,7 @@ module fpnew_opgroup_block #(
 ) (
   input logic                                     clk_i,
   input logic                                     rst_ni,
+  input logic [31:0]                              hart_id_i,
   // Input signals
   input logic [NUM_OPERANDS-1:0][Width-1:0]       operands_i,
   input logic [NUM_FORMATS-1:0][NUM_OPERANDS-1:0] is_boxed_i,
@@ -110,7 +113,8 @@ module fpnew_opgroup_block #(
         .NumPipeRegs   ( FmtPipeRegs[fmt]             ),
         .PipeConfig    ( PipeConfig                   ),
         .TagType       ( TagType                      ),
-        .TrueSIMDClass ( TrueSIMDClass                )
+        .TrueSIMDClass ( TrueSIMDClass                ),
+        .CompressedVecCmpResult ( CompressedVecCmpResult )
       ) i_fmt_slice (
         .clk_i,
         .rst_ni,
@@ -182,10 +186,12 @@ module fpnew_opgroup_block #(
       .PulpDivsqrt   ( PulpDivsqrt      ),
       .NumPipeRegs   ( REG              ),
       .PipeConfig    ( PipeConfig       ),
-      .TagType       ( TagType          )
+      .TagType       ( TagType          ),
+      .StochasticRndImplementation ( StochasticRndImplementation )
     ) i_multifmt_slice (
       .clk_i,
       .rst_ni,
+      .hart_id_i,
       .operands_i,
       .is_boxed_i,
       .rnd_mode_i,
diff --git a/src/fpnew_opgroup_fmt_slice.sv b/src/fpnew_opgroup_fmt_slice.sv
index 35fbe484..734ddf63 100644
--- a/src/fpnew_opgroup_fmt_slice.sv
+++ b/src/fpnew_opgroup_fmt_slice.sv
@@ -23,6 +23,7 @@ module fpnew_opgroup_fmt_slice #(
   parameter fpnew_pkg::pipe_config_t PipeConfig    = fpnew_pkg::BEFORE,
   parameter type                     TagType       = logic,
   parameter int unsigned             TrueSIMDClass = 0,
+  parameter logic                    CompressedVecCmpResult = 0,
   // Do not change
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
   localparam int unsigned NUM_LANES    = fpnew_pkg::num_lanes(Width, FpFormat, EnableVectors),
@@ -57,28 +58,36 @@ module fpnew_opgroup_fmt_slice #(
 
   localparam int unsigned FP_WIDTH  = fpnew_pkg::fp_width(FpFormat);
   localparam int unsigned SIMD_WIDTH = unsigned'(Width/NUM_LANES);
-
+  localparam int unsigned AUX_BITS = 2;
 
   logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid; // Handshake signals for the lanes
-  logic                 vectorial_op;
+  logic                 vectorial_op, cmp_op;
 
   logic [NUM_LANES*FP_WIDTH-1:0] slice_result;
   logic [Width-1:0]              slice_regular_result, slice_class_result, slice_vec_class_result;
+  logic [NUM_LANES-1:0]          slice_cmp_result;
 
   fpnew_pkg::status_t    [NUM_LANES-1:0] lane_status;
   logic                  [NUM_LANES-1:0] lane_ext_bit; // only the first one is actually used
   fpnew_pkg::classmask_e [NUM_LANES-1:0] lane_class_mask;
   TagType                [NUM_LANES-1:0] lane_tags; // only the first one is actually used
   logic                  [NUM_LANES-1:0] lane_masks;
-  logic                  [NUM_LANES-1:0] lane_vectorial, lane_busy, lane_is_class; // dito
+  logic                  [NUM_LANES-1:0] lane_busy, lane_is_class; // dito
+  logic    [NUM_LANES-1:0][AUX_BITS-1:0] lane_aux; // dito
+
+  logic result_is_vector, result_is_class, result_is_cmp;
 
-  logic result_is_vector, result_is_class;
+  fpnew_pkg::roundmode_e rnd_mode;
 
   // -----------
   // Input Side
   // -----------
+  // RSR supported only on SDOTP module
+  assign rnd_mode = (rnd_mode_i == fpnew_pkg::RSR) ? fpnew_pkg::RNE : rnd_mode_i;
+
   assign in_ready_o   = lane_in_ready[0]; // Upstream ready is given by first lane
   assign vectorial_op = vectorial_op_i & EnableVectors; // only do vectorial stuff if enabled
+  assign cmp_op       = (op_i == fpnew_pkg::CMP);
 
   // ---------------
   // Generate Lanes
@@ -94,7 +103,9 @@ module fpnew_opgroup_fmt_slice #(
       logic [NUM_OPERANDS-1:0][FP_WIDTH-1:0] local_operands; // lane-local operands
       logic [FP_WIDTH-1:0]                   op_result;      // lane-local results
       fpnew_pkg::status_t                    op_status;
+      logic [AUX_BITS-1:0]                   local_aux_data_input;
 
+      assign local_aux_data_input = {vectorial_op, cmp_op};
       assign in_valid = in_valid_i & ((lane == 0) | vectorial_op); // upper lanes only for vectors
       // Slice out the operands for this lane
       always_comb begin : prepare_input
@@ -106,22 +117,22 @@ module fpnew_opgroup_fmt_slice #(
       // Instantiate the operation from the selected opgroup
       if (OpGroup == fpnew_pkg::ADDMUL) begin : lane_instance
         fpnew_fma #(
-          .FpFormat    ( FpFormat    ),
-          .NumPipeRegs ( NumPipeRegs ),
-          .PipeConfig  ( PipeConfig  ),
-          .TagType     ( TagType     ),
-          .AuxType     ( logic       )
+          .FpFormat    ( FpFormat             ),
+          .NumPipeRegs ( NumPipeRegs          ),
+          .PipeConfig  ( PipeConfig           ),
+          .TagType     ( TagType              ),
+          .AuxType     ( logic [AUX_BITS-1:0] )
         ) i_fma (
           .clk_i,
           .rst_ni,
           .operands_i      ( local_operands               ),
           .is_boxed_i      ( is_boxed_i[NUM_OPERANDS-1:0] ),
-          .rnd_mode_i,
+          .rnd_mode_i      ( rnd_mode             ),
           .op_i,
           .op_mod_i,
           .tag_i,
           .mask_i          ( simd_mask_i[lane]    ),
-          .aux_i           ( vectorial_op         ), // Remember whether operation was vectorial
+          .aux_i           ( local_aux_data_input ), // Remember whether operation was vectorial and whether it was a CMP
           .in_valid_i      ( in_valid             ),
           .in_ready_o      ( lane_in_ready[lane]  ),
           .flush_i,
@@ -130,7 +141,7 @@ module fpnew_opgroup_fmt_slice #(
           .extension_bit_o ( lane_ext_bit[lane]   ),
           .tag_o           ( lane_tags[lane]      ),
           .mask_o          ( lane_masks[lane]     ),
-          .aux_o           ( lane_vectorial[lane] ),
+          .aux_o           ( lane_aux[lane]       ),
           .out_valid_o     ( out_valid            ),
           .out_ready_i     ( out_ready            ),
           .busy_o          ( lane_busy[lane]      )
@@ -149,7 +160,7 @@ module fpnew_opgroup_fmt_slice #(
         //   .rst_ni,
         //   .operands_i      ( local_operands               ),
         //   .is_boxed_i      ( is_boxed_i[NUM_OPERANDS-1:0] ),
-        //   .rnd_mode_i,
+        //   .rnd_mode_i      ( rnd_mode            ),
         //   .op_i,
         //   .op_mod_i,
         //   .tag_i,
@@ -161,7 +172,7 @@ module fpnew_opgroup_fmt_slice #(
         //   .status_o        ( op_status            ),
         //   .extension_bit_o ( lane_ext_bit[lane]   ),
         //   .tag_o           ( lane_tags[lane]      ),
-        //   .aux_o           ( lane_vectorial[lane] ),
+        //   .aux_o           ( lane_aux[lane]       ),
         //   .out_valid_o     ( out_valid            ),
         //   .out_ready_i     ( out_ready            ),
         //   .busy_o          ( lane_busy[lane]      )
@@ -169,22 +180,22 @@ module fpnew_opgroup_fmt_slice #(
         // assign lane_is_class[lane] = 1'b0;
       end else if (OpGroup == fpnew_pkg::NONCOMP) begin : lane_instance
         fpnew_noncomp #(
-          .FpFormat   (FpFormat),
-          .NumPipeRegs(NumPipeRegs),
-          .PipeConfig (PipeConfig),
-          .TagType    (TagType),
-          .AuxType    (logic)
+          .FpFormat   ( FpFormat             ),
+          .NumPipeRegs( NumPipeRegs          ),
+          .PipeConfig ( PipeConfig           ),
+          .TagType    ( TagType              ),
+          .AuxType    ( logic [AUX_BITS-1:0] )
         ) i_noncomp (
           .clk_i,
           .rst_ni,
           .operands_i      ( local_operands               ),
           .is_boxed_i      ( is_boxed_i[NUM_OPERANDS-1:0] ),
-          .rnd_mode_i,
+          .rnd_mode_i      ( rnd_mode              ),
           .op_i,
           .op_mod_i,
           .tag_i,
           .mask_i          ( simd_mask_i[lane]     ),
-          .aux_i           ( vectorial_op          ), // Remember whether operation was vectorial
+          .aux_i           ( local_aux_data_input  ), // Remember whether operation was vectorial and whether it was a CMP
           .in_valid_i      ( in_valid              ),
           .in_ready_o      ( lane_in_ready[lane]   ),
           .flush_i,
@@ -195,7 +206,7 @@ module fpnew_opgroup_fmt_slice #(
           .is_class_o      ( lane_is_class[lane]   ),
           .tag_o           ( lane_tags[lane]       ),
           .mask_o          ( lane_masks[lane]      ),
-          .aux_o           ( lane_vectorial[lane]  ),
+          .aux_o           ( lane_aux[lane]        ),
           .out_valid_o     ( out_valid             ),
           .out_ready_i     ( out_ready             ),
           .busy_o          ( lane_busy[lane]       )
@@ -223,6 +234,9 @@ module fpnew_opgroup_fmt_slice #(
     // Insert lane result into slice result
     assign slice_result[(unsigned'(lane)+1)*FP_WIDTH-1:unsigned'(lane)*FP_WIDTH] = local_result;
 
+    // Insert lane result into slice result for CMP operations
+    assign slice_cmp_result[unsigned'(lane)] = local_result[0];
+
     // Create Classification results
     if (TrueSIMDClass && SIMD_WIDTH >= 10) begin : vectorial_true_class // true vectorial class blocks are 10bits in size
       assign slice_vec_class_result[lane*SIMD_WIDTH +: 10] = lane_class_mask[lane];
@@ -253,7 +267,8 @@ module fpnew_opgroup_fmt_slice #(
   // ------------
   // Output Side
   // ------------
-  assign result_is_vector = lane_vectorial[0];
+  assign result_is_vector = lane_aux[0][1];
+  assign result_is_cmp    = lane_aux[0][0];
   assign result_is_class  = lane_is_class[0];
 
   assign slice_regular_result = $signed({extension_bit_o, slice_result});
@@ -272,7 +287,12 @@ module fpnew_opgroup_fmt_slice #(
   assign slice_class_result = result_is_vector ? slice_vec_class_result : lane_class_mask[0];
 
   // Select the proper result
-  assign result_o = result_is_class ? slice_class_result : slice_regular_result;
+  if (CompressedVecCmpResult) begin
+    assign result_o = result_is_class ? slice_class_result     :
+                      result_is_cmp   ? {'0, slice_cmp_result} : slice_regular_result;
+  end else begin
+    assign result_o = result_is_class ? slice_class_result : slice_regular_result;
+  end
 
   assign extension_bit_o                              = lane_ext_bit[0]; // upper lanes unused
   assign tag_o                                        = lane_tags[0];    // upper lanes unused
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index c5365c82..9c889a31 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -26,6 +26,7 @@ module fpnew_opgroup_multifmt_slice #(
   parameter int unsigned             NumPipeRegs   = 0,
   parameter fpnew_pkg::pipe_config_t PipeConfig    = fpnew_pkg::BEFORE,
   parameter type                     TagType       = logic,
+  parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
   localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS,
@@ -34,6 +35,7 @@ module fpnew_opgroup_multifmt_slice #(
 ) (
   input logic                                     clk_i,
   input logic                                     rst_ni,
+  input logic [31:0]                              hart_id_i,
   // Input signals
   input logic [NUM_OPERANDS-1:0][Width-1:0]       operands_i,
   input logic [NUM_FORMATS-1:0][NUM_OPERANDS-1:0] is_boxed_i,
@@ -69,14 +71,23 @@ Set PulpDivsqrt to 1 not to use the PULP DivSqrt unit \
 or set Features.FpFmtMask to support only FP32");
   end
 
+  if ((OpGroup == fpnew_pkg::DOTP) &&
+      !(FpFmtConfig[0] && (FpFmtConfig[2] || FpFmtConfig[4]) && (FpFmtConfig[3] || FpFmtConfig[5]))) begin
+    $fatal(1, "SDOTP only supported on 32b and 64b CVFPU instances in which at \
+least one 16b and one 8b format are supported. \
+The SDOTP operations compute on 8b inputs producing 16b outputs \
+or on 16b inputs producing 32b outputs");
+  end
+
   localparam int unsigned MAX_FP_WIDTH   = fpnew_pkg::max_fp_width(FpFmtConfig);
   localparam int unsigned MAX_INT_WIDTH  = fpnew_pkg::max_int_width(IntFmtConfig);
   localparam int unsigned NUM_LANES = fpnew_pkg::max_num_lanes(Width, FpFmtConfig, 1'b1);
+  localparam int unsigned NUM_DOTP_LANES = fpnew_pkg::num_dotp_lanes(Width, FpFmtConfig);
   localparam int unsigned NUM_INT_FORMATS = fpnew_pkg::NUM_INT_FORMATS;
   // We will send the format information along with the data
   localparam int unsigned FMT_BITS =
       fpnew_pkg::maximum($clog2(NUM_FORMATS), $clog2(NUM_INT_FORMATS));
-  localparam int unsigned AUX_BITS = FMT_BITS + 2; // also add vectorial and integer flags
+  localparam int unsigned AUX_BITS = FMT_BITS + 4; // also add vectorial and integer flags
 
   logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid, divsqrt_done, divsqrt_ready; // Handshake signals for the lanes
   logic                 vectorial_op;
@@ -86,11 +97,13 @@ or set Features.FpFmtMask to support only FP32");
   // additional flags for CONV
   logic       dst_fmt_is_int, dst_is_cpk;
   logic [1:0] dst_vec_op; // info for vectorial results (for packing)
-  logic [2:0] target_aux_d;
+  logic [1:0] target_aux_d, target_aux_q;
   logic       is_up_cast, is_down_cast;
 
-  logic [NUM_FORMATS-1:0][Width-1:0]     fmt_slice_result;
-  logic [NUM_INT_FORMATS-1:0][Width-1:0] ifmt_slice_result;
+  logic [NUM_FORMATS-1:0][Width-1:0]      fmt_slice_result;
+  logic [NUM_INT_FORMATS-1:0][Width-1:0]  ifmt_slice_result;
+  logic [NUM_FORMATS-1:0][3:0][Width-1:0] fmt_conv_cpk_result;
+
 
   logic [Width-1:0] conv_target_d, conv_target_q; // vectorial conversions update a register
 
@@ -101,16 +114,20 @@ or set Features.FpFmtMask to support only FP32");
   logic   [NUM_LANES-1:0][AUX_BITS-1:0] lane_aux; // only the first one is actually used
   logic   [NUM_LANES-1:0]               lane_busy; // dito
 
-  logic                result_is_vector;
+  logic                result_is_vector, result_is_vsum, op_is_vsum;
   logic [FMT_BITS-1:0] result_fmt;
   logic                result_fmt_is_int, result_is_cpk;
   logic [1:0]          result_vec_op; // info for vectorial results (for packing)
 
   logic simd_synch_rdy, simd_synch_done;
+  fpnew_pkg::roundmode_e rnd_mode;
 
   // -----------
   // Input Side
   // -----------
+  // RSR supported only on SDOTP module
+  assign rnd_mode = (rnd_mode_i == fpnew_pkg::RSR) ? fpnew_pkg::RNE : rnd_mode_i;
+
   assign in_ready_o   = lane_in_ready[0]; // Upstream ready is given by first lane
   assign vectorial_op = vectorial_op_i & EnableVectors; // only do vectorial stuff if enabled
 
@@ -118,17 +135,18 @@ or set Features.FpFmtMask to support only FP32");
   assign dst_fmt_is_int = (OpGroup == fpnew_pkg::CONV) & (op_i == fpnew_pkg::F2I);
   assign dst_is_cpk     = (OpGroup == fpnew_pkg::CONV) & (op_i == fpnew_pkg::CPKAB ||
                                                           op_i == fpnew_pkg::CPKCD);
-  assign dst_vec_op     = (OpGroup == fpnew_pkg::CONV) & {(op_i == fpnew_pkg::CPKCD), op_mod_i};
+  assign dst_vec_op     = {2{(OpGroup == fpnew_pkg::CONV)}} & {(op_i == fpnew_pkg::CPKCD), op_mod_i};
 
   assign is_up_cast   = (fpnew_pkg::fp_width(dst_fmt_i) > fpnew_pkg::fp_width(src_fmt_i));
   assign is_down_cast = (fpnew_pkg::fp_width(dst_fmt_i) < fpnew_pkg::fp_width(src_fmt_i));
+  assign op_is_vsum   = op_i == fpnew_pkg::VSUM ? 1'b1 : 1'b0;
 
   // The destination format is the int format for F2I casts
   assign dst_fmt    = dst_fmt_is_int ? int_fmt_i : dst_fmt_i;
 
   // The data sent along consists of the vectorial flag and format bits
-  assign aux_data      = {dst_fmt_is_int, vectorial_op, dst_fmt};
-  assign target_aux_d  = {dst_vec_op, dst_is_cpk};
+  assign aux_data      = {dst_is_cpk, dst_fmt_is_int, vectorial_op, dst_fmt, op_is_vsum};
+  assign target_aux_d  = dst_vec_op;
 
   // CONV passes one operand for assembly after the unit: opC for cpk, opB for others
   if (OpGroup == fpnew_pkg::CONV) begin : conv_target
@@ -165,35 +183,49 @@ or set Features.FpFmtMask to support only FP32");
         fpnew_pkg::get_conv_lane_int_formats(Width, FpFmtConfig, IntFmtConfig, LANE);
     localparam int unsigned CONV_WIDTH = fpnew_pkg::max_fp_width(CONV_FORMATS);
 
+    // Dotp-specific parameters
+    localparam fpnew_pkg::fmt_logic_t DOTP_FORMATS =
+        fpnew_pkg::get_dotp_lane_formats(Width, FpFmtConfig, LANE);
+    localparam int unsigned DOTP_MAX_FMT_WIDTH = fpnew_pkg::max_fp_width(DOTP_FORMATS);
+    localparam int unsigned DOTP_WIDTH = fpnew_pkg::minimum(2*DOTP_MAX_FMT_WIDTH, Width);
+
     // Lane parameters from Opgroup
-    localparam fpnew_pkg::fmt_logic_t LANE_FORMATS = (OpGroup == fpnew_pkg::CONV)
-                                                     ? CONV_FORMATS : ACTIVE_FORMATS;
-    localparam int unsigned LANE_WIDTH = (OpGroup == fpnew_pkg::CONV) ? CONV_WIDTH : MAX_WIDTH;
+    localparam fpnew_pkg::fmt_logic_t LANE_FORMATS = (OpGroup == fpnew_pkg::CONV) ? CONV_FORMATS :
+                                                     (OpGroup == fpnew_pkg::DOTP) ? DOTP_FORMATS :
+                                                                                    ACTIVE_FORMATS;
+    localparam int unsigned LANE_WIDTH = (OpGroup == fpnew_pkg::CONV) ? CONV_WIDTH :
+                                         (OpGroup == fpnew_pkg::DOTP) ? DOTP_WIDTH : MAX_WIDTH;
 
     logic [LANE_WIDTH-1:0] local_result; // lane-local results
 
     // Generate instances only if needed, lane 0 always generated
-    if ((lane == 0) || EnableVectors) begin : active_lane
+    if ((lane == 0) || (EnableVectors & !(OpGroup == fpnew_pkg::DOTP && (lane >= NUM_DOTP_LANES)))) begin : active_lane
       logic in_valid, out_valid, out_ready; // lane-local handshake
 
       logic [NUM_OPERANDS-1:0][LANE_WIDTH-1:0] local_operands;  // lane-local oprands
       logic [LANE_WIDTH-1:0]                   op_result;       // lane-local results
       fpnew_pkg::status_t                      op_status;
 
-      assign in_valid = in_valid_i & ((lane == 0) | vectorial_op); // upper lanes only for vectors
+      logic lane_is_used;
+      assign lane_is_used = (ACTIVE_FORMATS[src_fmt_i] & ~is_up_cast) |
+                            (ACTIVE_FORMATS[dst_fmt_i] &  is_up_cast) | (OpGroup == fpnew_pkg::DIVSQRT);
+      assign in_valid = in_valid_i & ((lane == 0) | vectorial_op) & lane_is_used; // upper lanes only for vectors
 
       // Slice out the operands for this lane, upper bits are ignored in the unit
       always_comb begin : prepare_input
         for (int unsigned i = 0; i < NUM_OPERANDS; i++) begin
-          if (i == 2) begin
-            local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(dst_fmt_i);
-          end else begin
-            local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(src_fmt_i);
-          end
+          local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(src_fmt_i);
         end
 
-        // override operand 0 for some conversions
-        if (OpGroup == fpnew_pkg::CONV) begin
+        if (OpGroup == fpnew_pkg::DOTP) begin
+          for (int unsigned i = 0; i < NUM_OPERANDS; i++) begin
+            if (i == 2) begin
+              local_operands[i] = operands_i[i] >> LANE*fpnew_pkg::fp_width(dst_fmt_i); // expanded format the width of dst_fmt
+            end else begin
+              local_operands[i] = operands_i[i] >> LANE*2*fpnew_pkg::fp_width(src_fmt_i); // twice the width of src_fmt
+            end
+          end
+        end else if (OpGroup == fpnew_pkg::CONV) begin // override operand 0 for some conversions
           // Source is an integer
           if (op_i == fpnew_pkg::I2F) begin
             local_operands[0] = operands_i[0] >> LANE*fpnew_pkg::int_width(int_fmt_i);
@@ -206,7 +238,7 @@ or set Features.FpFmtMask to support only FP32");
           // CPK
           end else if (dst_is_cpk) begin
             if (lane == 1) begin
-              local_operands[0] = operands_i[1][LANE_WIDTH-1:0]; // using opB as second argument
+              local_operands[0] = operands_i[1];
             end
           end
         end
@@ -225,6 +257,42 @@ or set Features.FpFmtMask to support only FP32");
           .rst_ni,
           .operands_i      ( local_operands  ),
           .is_boxed_i,
+          .rnd_mode_i      ( rnd_mode        ),
+          .op_i,
+          .op_mod_i,
+          .src_fmt_i,
+          .dst_fmt_i,
+          .tag_i,
+          .mask_i          ( simd_mask_i[lane]   ),
+          .aux_i           ( aux_data            ),
+          .in_valid_i      ( in_valid            ),
+          .in_ready_o      ( lane_in_ready[lane] ),
+          .flush_i,
+          .result_o        ( op_result           ),
+          .status_o        ( op_status           ),
+          .extension_bit_o ( lane_ext_bit[lane]  ),
+          .tag_o           ( lane_tags[lane]     ),
+          .mask_o          ( lane_masks[lane]    ),
+          .aux_o           ( lane_aux[lane]      ),
+          .out_valid_o     ( out_valid           ),
+          .out_ready_i     ( out_ready           ),
+          .busy_o          ( lane_busy[lane]     )
+        );
+      end else if (OpGroup == fpnew_pkg::DOTP) begin : lane_instance
+        fpnew_sdotp_multi_wrapper #(
+          .LaneWidth   ( LANE_WIDTH           ),
+          .FpFmtConfig ( LANE_FORMATS         ), // fp64 and fp32 not supported
+          .NumPipeRegs ( NumPipeRegs          ),
+          .PipeConfig  ( PipeConfig           ),
+          .TagType     ( TagType              ),
+          .AuxType     ( logic [AUX_BITS-1:0] ),
+          .StochasticRndImplementation ( StochasticRndImplementation )
+        ) i_fpnew_sdotp_multi_wrapper (
+          .clk_i,
+          .rst_ni,
+          .sdotp_hart_id_i ( {hart_id_i, 2'b00} + lane ),
+          .operands_i      ( local_operands[2:0] ), // 3 operands
+          .is_boxed_i,
           .rnd_mode_i,
           .op_i,
           .op_mod_i,
@@ -246,9 +314,8 @@ or set Features.FpFmtMask to support only FP32");
           .out_ready_i     ( out_ready           ),
           .busy_o          ( lane_busy[lane]     )
         );
-
       end else if (OpGroup == fpnew_pkg::DIVSQRT) begin : lane_instance
-        if (!PulpDivsqrt && LANE_FORMATS[0] && (LANE_FORMATS[1:fpnew_pkg::NUM_FP_FORMATS-1] == '0)) begin
+        if (!PulpDivsqrt && LANE_FORMATS[0] && (LANE_FORMATS[1:fpnew_pkg::NUM_FP_FORMATS-1] == '0)) begin : gen_th_32_divsqrt
           // The T-head-based DivSqrt unit is supported only in FP32-only configurations
           fpnew_divsqrt_th_32 #(
             .NumPipeRegs ( NumPipeRegs          ),
@@ -260,7 +327,7 @@ or set Features.FpFmtMask to support only FP32");
             .rst_ni,
             .operands_i      ( local_operands[1:0] ), // 2 operands
             .is_boxed_i      ( is_boxed_2op        ), // 2 operands
-            .rnd_mode_i,
+            .rnd_mode_i      ( rnd_mode            ),
             .op_i,
             .tag_i,
             .mask_i          ( simd_mask_i[lane]   ),
@@ -278,7 +345,7 @@ or set Features.FpFmtMask to support only FP32");
             .out_ready_i     ( out_ready           ),
             .busy_o          ( lane_busy[lane]     )
           );
-        end else begin
+        end else begin : gen_pulp_divsqrt
           fpnew_divsqrt_multi #(
             .FpFmtConfig ( LANE_FORMATS         ),
             .NumPipeRegs ( NumPipeRegs          ),
@@ -290,7 +357,7 @@ or set Features.FpFmtMask to support only FP32");
             .rst_ni,
             .operands_i       ( local_operands[1:0] ), // 2 operands
             .is_boxed_i       ( is_boxed_2op        ), // 2 operands
-            .rnd_mode_i,
+            .rnd_mode_i       ( rnd_mode            ),
             .op_i,
             .dst_fmt_i,
             .tag_i,
@@ -329,7 +396,7 @@ or set Features.FpFmtMask to support only FP32");
           .rst_ni,
           .operands_i      ( local_operands[0]   ),
           .is_boxed_i      ( is_boxed_1op        ),
-          .rnd_mode_i,
+          .rnd_mode_i      ( rnd_mode            ),
           .op_i,
           .op_mod_i,
           .src_fmt_i,
@@ -358,14 +425,17 @@ or set Features.FpFmtMask to support only FP32");
       assign lane_out_valid[lane] = out_valid & ((lane == 0) | result_is_vector);
 
       // Properly NaN-box or sign-extend the slice result if not in use
-      assign local_result      = lane_out_valid[lane] ? op_result : '{default: lane_ext_bit[0]};
+      assign local_result      = lane_out_valid[lane] ? op_result : {(LANE_WIDTH){lane_ext_bit[0]}};
       assign lane_status[lane] = lane_out_valid[lane] ? op_status : '0;
 
     // Otherwise generate constant sign-extension
     end else begin : inactive_lane
       assign lane_out_valid[lane] = 1'b0; // unused lane
       assign lane_in_ready[lane]  = 1'b0; // unused lane
-      assign local_result         = '{default: lane_ext_bit[0]}; // sign-extend/nan box
+      assign lane_aux[lane]       = 1'b0; // unused lane
+      assign lane_tags[lane]      = 1'b0; // unused lane
+      assign lane_ext_bit[lane]   = 1'b1; // NaN-box unused lane
+      assign local_result         = {(LANE_WIDTH){lane_ext_bit[0]}}; // sign-extend/nan box
       assign lane_status[lane]    = '0;
       assign lane_busy[lane]      = 1'b0;
     end
@@ -373,17 +443,38 @@ or set Features.FpFmtMask to support only FP32");
     // Generate result packing depending on float format
     for (genvar fmt = 0; fmt < NUM_FORMATS; fmt++) begin : pack_fp_result
       // Set up some constants
-      localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
-      // only for active formats within the lane
-      if (ACTIVE_FORMATS[fmt]) begin
-        assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
-            local_result[FP_WIDTH-1:0];
-      end else if ((LANE+1)*FP_WIDTH <= Width) begin
-        assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
-            '{default: lane_ext_bit[LANE]};
-      end else if (LANE*FP_WIDTH < Width) begin
-        assign fmt_slice_result[fmt][Width-1:LANE*FP_WIDTH] =
-            '{default: lane_ext_bit[LANE]};
+      if (OpGroup == fpnew_pkg::DOTP) begin
+        localparam int unsigned INACTIVE_MASK = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(LANE_FORMATS[fmt]));
+        localparam int unsigned FP_WIDTH      = fpnew_pkg::minimum(INACTIVE_MASK, fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt)));
+        // only for active formats within the lane
+        if (ACTIVE_FORMATS[fmt] && (LANE_WIDTH>0)) begin
+          if (FP_WIDTH==INACTIVE_MASK) begin
+            assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
+                local_result[FP_WIDTH-1:0];
+          end else begin
+            assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
+                local_result[FP_WIDTH-1:0];
+          end
+        end else if ((LANE+1)*FP_WIDTH <= Width) begin
+          assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
+              '{default: lane_ext_bit[LANE]};
+        end else if (LANE*FP_WIDTH < Width) begin
+          assign fmt_slice_result[fmt][Width-1:LANE*FP_WIDTH] =
+              '{default: lane_ext_bit[LANE]};
+        end
+      end else begin
+        localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+        // only for active formats within the lane
+        if (ACTIVE_FORMATS[fmt]) begin
+          assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
+              local_result[FP_WIDTH-1:0];
+        end else if ((LANE+1)*FP_WIDTH <= Width) begin
+          assign fmt_slice_result[fmt][(LANE+1)*FP_WIDTH-1:LANE*FP_WIDTH] =
+              '{default: lane_ext_bit[LANE]};
+        end else if (LANE*FP_WIDTH < Width) begin
+          assign fmt_slice_result[fmt][Width-1:LANE*FP_WIDTH] =
+              '{default: lane_ext_bit[LANE]};
+        end
       end
     end
 
@@ -423,7 +514,7 @@ or set Features.FpFmtMask to support only FP32");
   if (OpGroup == fpnew_pkg::CONV) begin : target_regs
     // Bypass pipeline signals, index i holds signal after i register stages
     logic [0:NumPipeRegs][Width-1:0] byp_pipe_target_q;
-    logic [0:NumPipeRegs][2:0]       byp_pipe_aux_q;
+    logic [0:NumPipeRegs][1:0]       byp_pipe_aux_q;
     logic [0:NumPipeRegs]            byp_pipe_valid_q;
     // Ready signal is combinatorial for all stages
     logic [0:NumPipeRegs] byp_pipe_ready;
@@ -454,9 +545,30 @@ or set Features.FpFmtMask to support only FP32");
     assign conv_target_q = byp_pipe_target_q[NumPipeRegs];
 
     // decode the aux data
-    assign {result_vec_op, result_is_cpk} = byp_pipe_aux_q[NumPipeRegs];
+    assign result_vec_op = byp_pipe_aux_q[NumPipeRegs];
+
+    for (genvar fmt = 0; fmt < NUM_FORMATS; fmt++) begin : pack_conv_cpk_result
+      localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+
+      for (genvar op_idx = 0; op_idx < 4; op_idx++) begin : pack_conv_cpk_result_operands
+        localparam int unsigned UPPER_LEFT  = 2*(op_idx+1)*FP_WIDTH;
+        localparam int unsigned LOWER_LEFT  = 2*op_idx*FP_WIDTH;
+        localparam int unsigned UPPER_RIGHT = 2*FP_WIDTH;
+
+        if(UPPER_LEFT <= Width) begin
+          always_comb begin : pack_conv_cpk
+            fmt_conv_cpk_result[fmt][op_idx] = conv_target_q; // rd pre-load
+            fmt_conv_cpk_result[fmt][op_idx][UPPER_LEFT-1:LOWER_LEFT] = fmt_slice_result[fmt][UPPER_RIGHT-1:0*FP_WIDTH]; // vfcpk
+          end
+        end else begin
+          assign fmt_conv_cpk_result[fmt][op_idx] = '0;
+        end
+      end
+    end
+
   end else begin : no_conv
-    assign {result_vec_op, result_is_cpk} = '0;
+    assign result_vec_op = '0;
+    assign fmt_conv_cpk_result = '0;
   end
 
   if (PulpDivsqrt) begin
@@ -472,11 +584,12 @@ or set Features.FpFmtMask to support only FP32");
   // ------------
   // Output Side
   // ------------
-  assign {result_fmt_is_int, result_is_vector, result_fmt} = lane_aux[0];
+  assign {result_is_cpk, result_fmt_is_int, result_is_vector, result_fmt, result_is_vsum} = lane_aux[0];
 
-  assign result_o = result_fmt_is_int
-                    ? ifmt_slice_result[result_fmt]
-                    : fmt_slice_result[result_fmt];
+  assign result_o = result_fmt_is_int ? ifmt_slice_result[result_fmt]                   :
+                    result_is_cpk     ? fmt_conv_cpk_result[result_fmt][result_vec_op]  :
+                    (result_is_vsum  && (Width == 64)) ? {{(Width/2){1'b1}}, {fmt_slice_result[result_fmt][Width/2-1:0]}} :
+                                        fmt_slice_result[result_fmt];
 
   assign extension_bit_o = lane_ext_bit[0]; // don't care about upper ones
   assign tag_o           = lane_tags[0];    // don't care about upper ones
diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv
index 7addc3e9..4021deeb 100644
--- a/src/fpnew_pkg.sv
+++ b/src/fpnew_pkg.sv
@@ -25,6 +25,7 @@ package fpnew_pkg;
   // | FP16       | IEEE binary16    | 16 bit | 5        | 10
   // | FP8        | binary8          |  8 bit | 5        | 2
   // | FP16ALT    | binary16alt      | 16 bit | 8        | 7
+  // | FP8ALT     | binary8alt       |  8 bit | 4        | 3
   // *NOTE:* Add new formats only at the end of the enumeration for backwards compatibilty!
 
   // Encoding for a format
@@ -33,7 +34,7 @@ package fpnew_pkg;
     int unsigned man_bits;
   } fp_encoding_t;
 
-  localparam int unsigned NUM_FP_FORMATS = 5; // change me to add formats
+  localparam int unsigned NUM_FP_FORMATS = 6; // change me to add formats
   localparam int unsigned FP_FORMAT_BITS = $clog2(NUM_FP_FORMATS);
 
   // FP formats
@@ -42,7 +43,8 @@ package fpnew_pkg;
     FP64    = 'd1,
     FP16    = 'd2,
     FP8     = 'd3,
-    FP16ALT = 'd4
+    FP16ALT = 'd4,
+    FP8ALT  = 'd5
     // add new formats here
   } fp_format_e;
 
@@ -52,14 +54,18 @@ package fpnew_pkg;
     '{11, 52}, // IEEE binary64 (double)
     '{5,  10}, // IEEE binary16 (half)
     '{5,  2},  // custom binary8
-    '{8,  7}   // custom binary16alt
+    '{8,  7},  // custom binary16alt
+    '{4,  3}   // custom binary8alt
     // add new formats here
   };
 
   typedef logic [0:NUM_FP_FORMATS-1]       fmt_logic_t;    // Logic indexed by FP format (for masks)
   typedef logic [0:NUM_FP_FORMATS-1][31:0] fmt_unsigned_t; // Unsigned indexed by FP format
 
-  localparam fmt_logic_t CPK_FORMATS = 5'b11000; // FP32 and FP64 can provide CPK only
+  localparam fmt_logic_t CPK_FORMATS  = 6'b110000; // FP32 and FP64 can provide CPK only
+  // FP32, FP64 cannot be provided for DOTP
+  // Small hack: FP32 only enabled for wide enough wrapper input widths for vsum.s instruction
+  localparam fmt_logic_t DOTP_FORMATS = 6'b101111;
 
   // ---------
   // INT TYPES
@@ -107,16 +113,17 @@ package fpnew_pkg;
   // --------------
   // FP OPERATIONS
   // --------------
-  localparam int unsigned NUM_OPGROUPS = 4;
+  localparam int unsigned NUM_OPGROUPS = 5;
 
   // Each FP operation belongs to an operation group
-  typedef enum logic [1:0] {
-    ADDMUL, DIVSQRT, NONCOMP, CONV
+  typedef enum logic [2:0] {
+    ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP
   } opgroup_e;
 
-  localparam int unsigned OP_BITS = 4;
+  localparam int unsigned OP_BITS = 5;
 
   typedef enum logic [OP_BITS-1:0] {
+    SDOTP, EXVSUM, VSUM,         // DOTP operation group
     FMADD, FNMSUB, ADD, MUL,     // ADDMUL operation group
     DIV, SQRT,                   // DIVSQRT operation group
     SGNJ, MINMAX, CMP, CLASSIFY, // NONCOMP operation group
@@ -134,6 +141,7 @@ package fpnew_pkg;
     RUP = 3'b011,
     RMM = 3'b100,
     ROD = 3'b101,  // This mode is not defined in RISC-V FP-SPEC
+    RSR = 3'b110,  // This mode is not defined in RISC-V FP-SPEC
     DYN = 3'b111
   } roundmode_e;
 
@@ -146,6 +154,12 @@ package fpnew_pkg;
     logic NX; // Inexact
   } status_t;
 
+  // CSR encoded alternate fp formats
+  typedef struct packed {
+    logic src; // Source format selection
+    logic dst; // Destination format selection
+  } fmt_mode_t;
+
   // Information about a floating point value
   typedef struct packed {
     logic is_normal;     // is the value normal
@@ -211,7 +225,7 @@ package fpnew_pkg;
     Width:         64,
     EnableVectors: 1'b0,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b11000,
+    FpFmtMask:     6'b110000,
     IntFmtMask:    4'b0011
   };
 
@@ -219,7 +233,7 @@ package fpnew_pkg;
     Width:         64,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b11000,
+    FpFmtMask:     6'b110000,
     IntFmtMask:    4'b0010
   };
 
@@ -227,7 +241,7 @@ package fpnew_pkg;
     Width:         32,
     EnableVectors: 1'b0,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b10000,
+    FpFmtMask:     6'b100000,
     IntFmtMask:    4'b0010
   };
 
@@ -235,7 +249,7 @@ package fpnew_pkg;
     Width:         64,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b11111,
+    FpFmtMask:     6'b111111,
     IntFmtMask:    4'b1111
   };
 
@@ -243,7 +257,7 @@ package fpnew_pkg;
     Width:         32,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b10111,
+    FpFmtMask:     6'b101111,
     IntFmtMask:    4'b1110
   };
 
@@ -251,7 +265,7 @@ package fpnew_pkg;
     Width:         32,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     5'b10001,
+    FpFmtMask:     6'b100010,
     IntFmtMask:    4'b0110
   };
 
@@ -268,7 +282,8 @@ package fpnew_pkg;
     UnitTypes:  '{'{default: PARALLEL}, // ADDMUL
                   '{default: MERGED},   // DIVSQRT
                   '{default: PARALLEL}, // NONCOMP
-                  '{default: MERGED}},  // CONV
+                  '{default: MERGED},   // CONV
+                  '{default: DISABLED}},  // DOTP
     PipeConfig: BEFORE
   };
 
@@ -277,10 +292,30 @@ package fpnew_pkg;
     UnitTypes:  '{'{default: PARALLEL}, // ADDMUL
                   '{default: DISABLED}, // DIVSQRT
                   '{default: PARALLEL}, // NONCOMP
-                  '{default: MERGED}},  // CONV
+                  '{default: MERGED},   // CONV
+                  '{default: MERGED}},  // DOTP
     PipeConfig: BEFORE
   };
 
+  // Stochastic rounding only supported by DOTP operation group block
+  typedef struct packed {
+    logic        EnableRSR;             // Enable RSR adding an LFSR in the SDOTP rounding modules
+    int unsigned RsrPrecision;          // Number of bits considered for the stochastic rounding decision
+    int unsigned LfsrInternalPrecision; // LFSR internal bitwidth setting the pseudorandom number periodicity
+  } rsr_impl_t;
+
+  localparam rsr_impl_t DEFAULT_NO_RSR = '{
+    EnableRSR:           1'b0,
+    RsrPrecision:          12,
+    LfsrInternalPrecision: 32
+  };
+
+  localparam rsr_impl_t DEFAULT_RSR = '{
+    EnableRSR:           1'b1,
+    RsrPrecision:          12,
+    LfsrInternalPrecision: 32
+  };
+
   // -----------------------
   // Synthesis optimization
   // -----------------------
@@ -314,6 +349,15 @@ package fpnew_pkg;
     return res;
   endfunction
 
+  // Returns the width in bits of the widest FP format enabled in cfg (0 if none is enabled)
+  function automatic int unsigned max_dotp_dst_fp_width(fmt_logic_t cfg);
+    automatic int unsigned res = 0;
+    for (int unsigned i = 0; i < NUM_FP_FORMATS; i++)
+      if (cfg[i]) // compare bit widths, not the enum encoding of the format
+        res = unsigned'(maximum(res, fp_width(fp_format_e'(i))));
+    return res;
+  endfunction
+
   // Returns the narrowest FP format present
   function automatic int unsigned min_fp_width(fmt_logic_t cfg);
     automatic int unsigned res = max_fp_width(cfg);
@@ -371,6 +415,7 @@ package fpnew_pkg;
       DIV, SQRT:                   return DIVSQRT;
       SGNJ, MINMAX, CMP, CLASSIFY: return NONCOMP;
       F2F, F2I, I2F, CPKAB, CPKCD: return CONV;
+      SDOTP, EXVSUM, VSUM:         return DOTP;
       default:                     return NONCOMP;
     endcase
   endfunction
@@ -382,6 +427,7 @@ package fpnew_pkg;
       DIVSQRT: return 2;
       NONCOMP: return 2;
       CONV:    return 3; // vectorial casts use 3 operands
+      DOTP:    return 3; // splitting into 5 operands done in wrapper
       default: return 0;
     endcase
   endfunction
@@ -437,6 +483,41 @@ package fpnew_pkg;
     return res;
   endfunction
 
+  // Returns how many DOTP lanes to generate (0 unless a 16b format and FP32, FP8 or FP8ALT are set)
+  function automatic int num_dotp_lanes(int unsigned width, fmt_logic_t cfg);
+    automatic logic has_16b_fmt   = cfg[FP16] || cfg[FP16ALT];
+    automatic logic has_other_fmt = cfg[FP32] || cfg[FP8] || cfg[FP8ALT];
+    return (has_16b_fmt && has_other_fmt) ? (width / (2*min_fp_width(cfg))) : 0;
+  endfunction
+
+  // Returns the subset of the enabled FP formats in cfg that DOTP lane lane_no operates on
+  function automatic fmt_logic_t get_dotp_lane_formats(int unsigned width,
+                                                       fmt_logic_t cfg,
+                                                       int unsigned lane_no);
+    // Lanes expanding 16-bit -> 32-bit exist only when FP32 is enabled, one per 32 bits of
+    // datapath width
+    automatic int unsigned num_wide_lanes = cfg[FP32] ? (width / 32) : 0;
+    // Lanes below that count keep FP32 in their mask (16-bit -> 32-bit), all others are
+    // restricted to the 16-bit and 8-bit formats (8-bit -> 16-bit); FP64 is always dropped
+    automatic fmt_logic_t lane_mask = (lane_no < num_wide_lanes) ? 6'b101111
+                                                                  : 6'b001111;
+    // Only formats that are also enabled in cfg survive
+    return cfg & lane_mask;
+  endfunction
+
+  // Returns the mask of FP formats usable as DOTP destination for the given source formats
+  function automatic fmt_logic_t get_dotp_dst_fmts(fmt_logic_t cfg, fmt_logic_t src_cfg);
+    automatic logic       has_16b_src = src_cfg[FP16] || src_cfg[FP16ALT];
+    automatic logic       has_8b_src  = src_cfg[FP8]  || src_cfg[FP8ALT];
+    automatic fmt_logic_t dst_fmts    = '0; // FP64 stays cleared: not supported as dstFmt
+    dst_fmts[FP32]    = cfg[FP32]    && (has_16b_src || has_8b_src);
+    dst_fmts[FP16]    = cfg[FP16]    && has_8b_src;
+    dst_fmts[FP16ALT] = cfg[FP16ALT] && has_8b_src;
+    dst_fmts[FP8]     = cfg[FP8];    // FP8 supported as dstFmt for VSUM
+    dst_fmts[FP8ALT]  = cfg[FP8ALT]; // FP8ALT supported as dstFmt for VSUM
+    return dst_fmts;
+  endfunction
+
   // Returns a mask of active INT formats that are present in lane lane_no of a CONV slice
   function automatic ifmt_logic_t get_conv_lane_int_formats(int unsigned width,
                                                             fmt_logic_t cfg,
diff --git a/src/fpnew_rounding.sv b/src/fpnew_rounding.sv
index 4e677209..bb6e868f 100644
--- a/src/fpnew_rounding.sv
+++ b/src/fpnew_rounding.sv
@@ -14,13 +14,24 @@
 // Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
 module fpnew_rounding #(
-  parameter int unsigned AbsWidth=2 // Width of the abolute value, without sign bit
+  parameter int unsigned          AbsWidth      = 2, // Width of the absolute value, without sign bit
+  parameter logic                 EnableRSR     = 0,
+  parameter int unsigned          RsrPrecision  = 12,
+  // LFSR parameters
+  parameter int unsigned          LfsrWidth     = 32,
+  parameter logic [LfsrWidth-1:0] RstVal        = '1
 ) (
+  // LFSR inputs
+  input logic                  clk_i,
+  input logic                  rst_ni,
+  input logic [33:0]           id_i,
   // Input value
   input logic [AbsWidth-1:0]   abs_value_i,             // absolute value without sign
   input logic                  sign_i,
+  input logic                  en_rsr_i,
   // Rounding information
   input logic [1:0]            round_sticky_bits_i,     // round and sticky bits {RS}
+  input logic [RsrPrecision-1:0] stochastic_rounding_bits_i,
   input fpnew_pkg::roundmode_e rnd_mode_i,
   input logic                  effective_subtraction_i, // sign of inputs affects rounding of zeroes
   // Output value
@@ -32,7 +43,7 @@ module fpnew_rounding #(
 
   logic round_up; // Rounding decision
 
-  // Take the rounding decision according to RISC-V spec
+  // Take the rounding decision according to RISC-V spec, plus additional unbiased rounding modes
   // RoundMode | Mnemonic | Meaning
   // :--------:|:--------:|:-------
   //    000    |   RNE    | Round to Nearest, ties to Even
@@ -41,10 +52,34 @@ module fpnew_rounding #(
   //    011    |   RUP    | Round Up (towards \infty)
   //    100    |   RMM    | Round to Nearest, ties to Max Magnitude
   //    101    |   ROD    | Round towards odd (this mode is not define in RISC-V FP-SPEC)
+  //    110    |   RSR    | Round by Stochastic Rounding
   //  others   |          | *invalid*
+
+  // LFSR generating random numbers for RSR mode
+  logic [RsrPrecision-1:0] lfsr_out;
+
+  if (EnableRSR) begin : gen_lfsr
+    lfsr_sr #(
+      .LfsrWidth       ( LfsrWidth           ),
+      .OutWidth        ( RsrPrecision        ),
+      .RstVal          ( RstVal              ),
+      .CipherLayers    ( 0                   ),
+      .CipherReg       ( 0                   )
+    ) i_lfsr (
+      .clk_i           ( clk_i               ),
+      .rst_ni          ( rst_ni              ),
+      .id_i            ( id_i                ),
+      .en_i            ( en_rsr_i            ),
+      .out_o           ( lfsr_out            )
+    );
+  end else begin
+    assign lfsr_out = '0;
+  end
+
+  // Take the rounding decision for the selected rounding mode (RSR uses the LFSR output)
   always_comb begin : rounding_decision
     unique case (rnd_mode_i)
-      fpnew_pkg::RNE: // Decide accoring to round/sticky bits
+      fpnew_pkg::RNE: // Decide according to round/sticky bits
         unique case (round_sticky_bits_i)
           2'b00,
           2'b01: round_up = 1'b0;           // < ulp/2 away, round down
@@ -57,6 +92,14 @@ module fpnew_rounding #(
       fpnew_pkg::RUP: round_up = (| round_sticky_bits_i) ? ~sign_i : 1'b0; // to 0 if -, away if +
       fpnew_pkg::RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
       fpnew_pkg::ROD: round_up = ~abs_value_i[0] & (| round_sticky_bits_i);
+      // Decide stochastically, comparing trailing bits and pseudo-random number
+      fpnew_pkg::RSR: begin
+        if (EnableRSR) begin
+          round_up = (lfsr_out < stochastic_rounding_bits_i) ? 1'b1 : 1'b0;
+        end else begin
+          round_up = fpnew_pkg::DONT_CARE;
+        end
+      end
       default: round_up = fpnew_pkg::DONT_CARE; // propagate x
     endcase
   end
diff --git a/src/fpnew_sdotp_multi.sv b/src/fpnew_sdotp_multi.sv
new file mode 100644
index 00000000..2d4bc675
--- /dev/null
+++ b/src/fpnew_sdotp_multi.sv
@@ -0,0 +1,1444 @@
+// Copyright 2019-2021 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch>
+//          Stefan Mach <smach@iis.ee.ethz.ch>
+//          Gianna Paulin <pauling@iis.ee.ethz.ch>
+
+// This unit can be used to compute the following operations:
+// - EXSDOTP: expanding dot product with accumulation
+//             (op_a * op_b) + (op_c * op_d) + op_e
+//             where op_e and the result are expressed with twice as many bits as op_a, op_b, op_c, op_d
+// - EXVSUM: expanding vector inner sum
+//             (op_a + op_c + op_e)
+//             where op_e and the result are expressed with twice as many bits as op_a, op_c
+//             EXVSUM is computed setting op_b and op_d to 1
+// - VSUM:   non-expanding vector inner sum
+//             (op_a + op_c + op_e)
+//             where op_e and the result are expressed with as many bits as op_a, op_c
+//             The bit-width can be as large as the maximum allowed destination width
+//             VSUM is computed by-passing the two multiplications, thus neglecting op_b and op_d
+
+// All the supported operations require a three-term addend (X + Y + Z). The unit first computes
+// W = X + Y and then result = W + Z, where X is the maximum addend, Y is the intermediate addend
+// and Z is the minimum addend.
+
+// The unit requires two one-hot config strings to select the allowed input and output formats.
+// The maximum output format should be twice as large as the maximum input format. For non-expanding
+// VSUM, the maximum input format is set by the maximum output format (op_a and op_c are as large
+// as the accumulator and the result). The input format is then selected at run-time by the signal
+// src_fmt_i.
+
+`include "common_cells/registers.svh"
+
+module fpnew_sdotp_multi #(
+  // One-hot config string: | FP32 | FP64 | FP16 | FP8 | FP16ALT | FP8ALT |
+  parameter fpnew_pkg::fmt_logic_t   SrcDotpFpFmtConfig = '1, // FP32 and wider formats are not allowed
+                                                              // Supported source formats (FP8, FP8ALT, FP16, FP16ALT)
+  parameter fpnew_pkg::fmt_logic_t   DstDotpFpFmtConfig = '1, // FP8 and FP8alt are not supported
+                                                              // Supported destination formats (FP16, FP16ALT, FP32)
+  parameter int unsigned             NumPipeRegs = 0,
+  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::BEFORE,
+  parameter type                     TagType     = logic,
+  parameter type                     AuxType     = logic,
+  parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
+// Do not change
+  localparam int unsigned SRC_WIDTH = fpnew_pkg::max_fp_width(SrcDotpFpFmtConfig),
+  localparam int unsigned DST_WIDTH = fpnew_pkg::max_fp_width(DstDotpFpFmtConfig), // must be 2*SRC_WIDTH (expanding SDOTP)
+  localparam int unsigned NUM_FORMATS = fpnew_pkg::NUM_FP_FORMATS
+) (
+  input  logic                        clk_i,
+  input  logic                        rst_ni,
+  input  logic [33:0]                 sdotp_hart_id_i,
+  // Input signals
+  // op_a and op_c will contain useful bits in [SRC_WIDTH-1:0] for EXSDOTP, EXVSUM
+  // op_a and op_c will contain useful bits in [DST_WIDTH-1:0] for VSUM (non-expanding)
+  // op_b and op_d are neglected for non-expanding VSUM
+  input  logic [DST_WIDTH-1:0]        operand_a_i,
+  input  logic [SRC_WIDTH-1:0]        operand_b_i,
+  input  logic [DST_WIDTH-1:0]        operand_c_i,
+  input  logic [SRC_WIDTH-1:0]        operand_d_i,
+  input  logic [DST_WIDTH-1:0]        dst_operands_i, // accumulator
+  input  logic [NUM_FORMATS-1:0][4:0] is_boxed_i,     // 5 operands
+  input  fpnew_pkg::roundmode_e       rnd_mode_i,
+  input  fpnew_pkg::operation_e       op_i,
+  input  logic                        op_mod_i,
+  input  fpnew_pkg::fp_format_e       src_fmt_i, // format of op_a, op_b, op_c, op_d
+  input  fpnew_pkg::fp_format_e       dst_fmt_i, // format of the accumulator (op_e) and result
+  input  TagType                      tag_i,
+  input  logic                        mask_i,
+  input  AuxType                      aux_i,
+  // Input Handshake
+  input  logic                        in_valid_i,
+  output logic                        in_ready_o,
+  input  logic                        flush_i,
+  // Output signals
+  output logic [DST_WIDTH-1:0]        result_o,
+  output fpnew_pkg::status_t          status_o,
+  output logic                        extension_bit_o,
+  output TagType                      tag_o,
+  output logic                        mask_o,
+  output AuxType                      aux_o,
+  // Output handshake
+  output logic                        out_valid_o,
+  input  logic                        out_ready_i,
+  // Indication of valid data in flight
+  output logic                        busy_o
+);
+
+  // ----------
+  // Constants
+  // ----------
+  // The super-format that can hold all formats
+  localparam fpnew_pkg::fp_encoding_t SUPER_FORMAT = fpnew_pkg::super_format(SrcDotpFpFmtConfig);
+  localparam fpnew_pkg::fp_encoding_t SUPER_DST_FORMAT = fpnew_pkg::super_format(DstDotpFpFmtConfig);
+
+  localparam int unsigned SUPER_EXP_BITS = SUPER_FORMAT.exp_bits;
+  localparam int unsigned SUPER_MAN_BITS = SUPER_FORMAT.man_bits;
+  localparam int unsigned SUPER_DST_EXP_BITS = SUPER_DST_FORMAT.exp_bits;
+  localparam int unsigned SUPER_DST_MAN_BITS = fpnew_pkg::maximum(SUPER_DST_FORMAT.man_bits, 2*SUPER_MAN_BITS + 1);
+
+  // Precision bits 'p' include the implicit bit
+  localparam int unsigned PRECISION_BITS = SUPER_MAN_BITS + 1;
+  // Destination precision bits 'p_dst' include the implicit bit
+  localparam int unsigned DST_PRECISION_BITS = SUPER_DST_MAN_BITS + 1;
+  localparam int unsigned ADDITIONAL_PRECISION_BITS = fpnew_pkg::maximum(DST_PRECISION_BITS - 2 * PRECISION_BITS, 0);
+  // Stochastic rounding implementation
+  localparam logic        ENABLE_RSR         = StochasticRndImplementation.EnableRSR;
+  localparam int unsigned RSR_PRECISION_BITS = StochasticRndImplementation.RsrPrecision;
+  localparam int unsigned LFSR_WIDTH         = StochasticRndImplementation.LfsrInternalPrecision;
+  // The leading-zero counter operates on LZC_SUM_WIDTH bits
+  localparam int unsigned LZC_SUM_WIDTH  = 2*DST_PRECISION_BITS + PRECISION_BITS + 5;
+  localparam int unsigned LZC_RESULT_WIDTH = $clog2(LZC_SUM_WIDTH);
+
+  // Internal exponent width must accommodate all meaningful exponent values in order to avoid
+  // datapath leakage. This is either given by the exponent bits or the width of the LZC result.
+  localparam int unsigned EXP_WIDTH = unsigned'(fpnew_pkg::maximum(SUPER_EXP_BITS + 2, LZC_RESULT_WIDTH));
+  localparam int unsigned DST_EXP_WIDTH = unsigned'(fpnew_pkg::maximum(SUPER_DST_EXP_BITS + 2, LZC_RESULT_WIDTH));
+  // Shift amount width: maximum internal mantissa size is 2*DST_PRECISION_BITS+3 bits
+  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(2*DST_PRECISION_BITS+PRECISION_BITS+4);
+  localparam int unsigned DST_SHIFT_AMOUNT_WIDTH = $clog2(2*DST_PRECISION_BITS+PRECISION_BITS+5);
+  // Pipelines
+  localparam NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
+                            ? NumPipeRegs
+                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                               ? ((NumPipeRegs + 1) / 3) // Second to get distributed regs
+                               : 0); // no regs here otherwise
+  localparam NUM_MID_REGS = PipeConfig == fpnew_pkg::INSIDE
+                          ? NumPipeRegs
+                          : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                             ? ((NumPipeRegs + 2) / 3) // First to get distributed regs
+                             : 0); // no regs here otherwise
+  localparam NUM_OUT_REGS = PipeConfig == fpnew_pkg::AFTER
+                            ? NumPipeRegs
+                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                               ? (NumPipeRegs / 3) // Last to get distributed regs
+                               : 0); // no regs here otherwise
+
+  // ----------------
+  // Type definition
+  // ----------------
+  typedef struct packed {
+    logic                      sign;
+    logic [SUPER_EXP_BITS-1:0] exponent;
+    logic [SUPER_MAN_BITS-1:0] mantissa;
+  } fp_src_t;
+  typedef struct packed {
+    logic                          sign;
+    logic [SUPER_DST_EXP_BITS-1:0] exponent;
+    logic [SUPER_DST_MAN_BITS-1:0] mantissa;
+  } fp_dst_t;
+
+  // ---------------
+  // Input pipeline
+  // ---------------
+  // Selected pipeline output signals as non-arrays
+  logic [DST_WIDTH-1:0]  operand_a_q;
+  logic [SRC_WIDTH-1:0]  operand_b_q;
+  logic [DST_WIDTH-1:0]  operand_c_q;
+  logic [SRC_WIDTH-1:0]  operand_d_q;
+  logic [DST_WIDTH-1:0]  dst_operands_q;
+  fpnew_pkg::fp_format_e src_fmt_q;
+  fpnew_pkg::fp_format_e dst_fmt_q;
+
+  // Input pipeline signals, index i holds signal after i register stages
+  logic                  [0:NUM_INP_REGS][DST_WIDTH-1:0]        inp_pipe_operand_a_q;
+  logic                  [0:NUM_INP_REGS][SRC_WIDTH-1:0]        inp_pipe_operand_b_q;
+  logic                  [0:NUM_INP_REGS][DST_WIDTH-1:0]        inp_pipe_operand_c_q;
+  logic                  [0:NUM_INP_REGS][SRC_WIDTH-1:0]        inp_pipe_operand_d_q;
+  logic                  [0:NUM_INP_REGS][DST_WIDTH-1:0]        inp_pipe_dst_operands_q;
+  logic                  [0:NUM_INP_REGS][NUM_FORMATS-1:0][4:0] inp_pipe_is_boxed_q;
+  fpnew_pkg::roundmode_e [0:NUM_INP_REGS]                       inp_pipe_rnd_mode_q;
+  fpnew_pkg::operation_e [0:NUM_INP_REGS]                       inp_pipe_op_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_op_mod_q;
+  fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_src_fmt_q;
+  fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
+  TagType                [0:NUM_INP_REGS]                       inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_mask_q;
+  AuxType                [0:NUM_INP_REGS]                       inp_pipe_aux_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_valid_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_INP_REGS] inp_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from inputs
+  assign inp_pipe_operand_a_q[0]    = operand_a_i;
+  assign inp_pipe_operand_b_q[0]    = operand_b_i;
+  assign inp_pipe_operand_c_q[0]    = operand_c_i;
+  assign inp_pipe_operand_d_q[0]    = operand_d_i;
+  assign inp_pipe_dst_operands_q[0] = dst_operands_i;
+  assign inp_pipe_is_boxed_q[0]     = is_boxed_i;
+  assign inp_pipe_rnd_mode_q[0]     = rnd_mode_i;
+  assign inp_pipe_op_q[0]           = op_i;
+  assign inp_pipe_op_mod_q[0]       = op_mod_i;
+  assign inp_pipe_src_fmt_q[0]      = src_fmt_i;
+  assign inp_pipe_dst_fmt_q[0]      = dst_fmt_i;
+  assign inp_pipe_tag_q[0]          = tag_i;
+  assign inp_pipe_mask_q[0]         = mask_i;
+  assign inp_pipe_aux_q[0]          = aux_i;
+  assign inp_pipe_valid_q[0]        = in_valid_i;
+  // Input stage: Propagate pipeline ready signal to upstream circuitry
+  assign in_ready_o = inp_pipe_ready[0];
+  // Generate the register stages
+  for (genvar i = 0; i < NUM_INP_REGS; i++) begin : gen_input_pipeline
+    // Internal register enable for this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign inp_pipe_ready[i] = inp_pipe_ready[i+1] | ~inp_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(inp_pipe_valid_q[i+1], inp_pipe_valid_q[i], inp_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = inp_pipe_ready[i] & inp_pipe_valid_q[i];
+    // Generate the pipeline registers within the stages, use enable-registers
+    `FFL(inp_pipe_operand_a_q[i+1],    inp_pipe_operand_a_q[i],    reg_ena, '0)
+    `FFL(inp_pipe_operand_b_q[i+1],    inp_pipe_operand_b_q[i],    reg_ena, '0)
+    `FFL(inp_pipe_operand_c_q[i+1],    inp_pipe_operand_c_q[i],    reg_ena, '0)
+    `FFL(inp_pipe_operand_d_q[i+1],    inp_pipe_operand_d_q[i],    reg_ena, '0)
+    `FFL(inp_pipe_dst_operands_q[i+1], inp_pipe_dst_operands_q[i], reg_ena, '0)
+    `FFL(inp_pipe_is_boxed_q[i+1],     inp_pipe_is_boxed_q[i],     reg_ena, '0)
+    `FFL(inp_pipe_rnd_mode_q[i+1],     inp_pipe_rnd_mode_q[i],     reg_ena, fpnew_pkg::RNE)
+    `FFL(inp_pipe_op_q[i+1],           inp_pipe_op_q[i],           reg_ena, fpnew_pkg::SDOTP)
+    `FFL(inp_pipe_op_mod_q[i+1],       inp_pipe_op_mod_q[i],       reg_ena, '0)
+    `FFL(inp_pipe_src_fmt_q[i+1],      inp_pipe_src_fmt_q[i],      reg_ena, fpnew_pkg::FP8)
+    `FFL(inp_pipe_dst_fmt_q[i+1],      inp_pipe_dst_fmt_q[i],      reg_ena, fpnew_pkg::FP16)
+    `FFL(inp_pipe_tag_q[i+1],          inp_pipe_tag_q[i],          reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],         inp_pipe_mask_q[i],         reg_ena, '0)
+    `FFL(inp_pipe_aux_q[i+1],          inp_pipe_aux_q[i],          reg_ena, AuxType'('0))
+  end
+  // Output stage: assign selected pipe outputs to signals for later use
+  assign operand_a_q    = inp_pipe_operand_a_q[NUM_INP_REGS];
+  assign operand_b_q    = inp_pipe_operand_b_q[NUM_INP_REGS];
+  assign operand_c_q    = inp_pipe_operand_c_q[NUM_INP_REGS];
+  assign operand_d_q    = inp_pipe_operand_d_q[NUM_INP_REGS];
+  assign dst_operands_q = inp_pipe_dst_operands_q[NUM_INP_REGS];
+  assign src_fmt_q      = inp_pipe_src_fmt_q[NUM_INP_REGS];
+  assign dst_fmt_q      = inp_pipe_dst_fmt_q[NUM_INP_REGS];
+
+  logic [3:0][SRC_WIDTH-1:0] operands_post_inp_pipe;
+  // Vivado workaround: bit-wise assignment; only bits [SRC_WIDTH-1:0] of operands a, b, c, d are copied
+  for (genvar i = 0; i < SRC_WIDTH; i++) begin : gen_op_assign
+    assign operands_post_inp_pipe[3][i] = operand_d_q[i];
+    assign operands_post_inp_pipe[2][i] = operand_c_q[i];
+    assign operands_post_inp_pipe[1][i] = operand_b_q[i];
+    assign operands_post_inp_pipe[0][i] = operand_a_q[i];
+  end
+
+  // -----------------
+  // Input processing
+  // -----------------
+
+  // -----------------
+  // Source operands
+  // -----------------
+  logic        [NUM_FORMATS-1:0][3:0]                     fmt_sign;
+  logic signed [NUM_FORMATS-1:0][3:0][SUPER_EXP_BITS-1:0] fmt_exponent;
+  logic        [NUM_FORMATS-1:0][3:0][SUPER_MAN_BITS-1:0] fmt_mantissa;
+
+  fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][4:0] info_q;
+  fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][1:0] info_vsum_q;
+
+  // FP Input initialization (Src)
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_src_init_inputs
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (SrcDotpFpFmtConfig[fmt]) begin : active_src_format
+      logic [3:0][FP_WIDTH-1:0] trimmed_ops;
+
+      // Classify input
+      fpnew_classifier #(
+        .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+        .NumOperands ( 4                            )
+      ) i_fpnew_classifier (
+        .operands_i  ( trimmed_ops                                 ),
+        .is_boxed_i  ( inp_pipe_is_boxed_q[NUM_INP_REGS][fmt][3:0] ),
+        .info_o      ( info_q[fmt][3:0]                            )
+      );
+      for (genvar op = 0; op < 4; op++) begin : gen_operands
+        assign trimmed_ops[op]       = operands_post_inp_pipe[op][FP_WIDTH-1:0];
+        assign fmt_sign[fmt][op]     = operands_post_inp_pipe[op][FP_WIDTH-1];
+        assign fmt_exponent[fmt][op] = signed'({1'b0, operands_post_inp_pipe[op][MAN_BITS+:EXP_BITS]});
+        assign fmt_mantissa[fmt][op] = {info_q[fmt][op].is_normal, operands_post_inp_pipe[op][MAN_BITS-1:0]} <<
+                                       (SUPER_MAN_BITS - MAN_BITS); // move to left of mantissa
+      end
+    end else begin : inactive_src_format
+      assign info_q[fmt][3:0]  = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_sign[fmt]     = fpnew_pkg::DONT_CARE;             // format disabled
+      assign fmt_exponent[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_mantissa[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+    end
+  end
+
+  // ----------------------------
+  // Non-expanding VSUM operands
+  // ----------------------------
+  logic        [NUM_FORMATS-1:0][1:0]                         fmt_vsum_sign;
+  logic signed [NUM_FORMATS-1:0][1:0][SUPER_DST_EXP_BITS-1:0] fmt_vsum_exponent;
+  logic        [NUM_FORMATS-1:0][1:0][SUPER_DST_MAN_BITS-1:0] fmt_vsum_mantissa;
+
+  // FP Input initialization (non-expanding VSUM: op_a and op_c read at destination-format width)
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_vsum_init_inputs
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (DstDotpFpFmtConfig[fmt]) begin : active_vsum_format
+      logic [1:0][FP_WIDTH-1:0] trimmed_vsum_ops;
+      logic [1:0]               vsum_ops_is_boxed;
+
+      assign vsum_ops_is_boxed = {inp_pipe_is_boxed_q[NUM_INP_REGS][fmt][2],
+                                  inp_pipe_is_boxed_q[NUM_INP_REGS][fmt][0]}; // {op_c, op_a} boxed flags
+
+      // Classify input
+      fpnew_classifier #(
+        .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+        .NumOperands ( 2                            )
+      ) i_fpnew_classifier (
+        .operands_i  ( trimmed_vsum_ops  ),
+        .is_boxed_i  ( vsum_ops_is_boxed ),
+        .info_o      ( info_vsum_q[fmt]  )
+      );
+      assign trimmed_vsum_ops          = {operand_c_q[FP_WIDTH-1:0], operand_a_q[FP_WIDTH-1:0]};
+      assign fmt_vsum_sign[fmt]        = {operand_c_q[FP_WIDTH-1], operand_a_q[FP_WIDTH-1]};
+      assign fmt_vsum_exponent[fmt][1] = signed'({1'b0, operand_c_q[MAN_BITS+:EXP_BITS]});
+      assign fmt_vsum_exponent[fmt][0] = signed'({1'b0, operand_a_q[MAN_BITS+:EXP_BITS]});
+      assign fmt_vsum_mantissa[fmt][1] = {info_vsum_q[fmt][1].is_normal, operand_c_q[MAN_BITS-1:0]}
+                                         << (SUPER_DST_MAN_BITS - MAN_BITS);
+      assign fmt_vsum_mantissa[fmt][0] = {info_vsum_q[fmt][0].is_normal, operand_a_q[MAN_BITS-1:0]}
+                                         << (SUPER_DST_MAN_BITS - MAN_BITS);
+    end else begin : inactive_dst_format
+      assign info_vsum_q[fmt]       = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_vsum_sign[fmt]     = fpnew_pkg::DONT_CARE;             // format disabled
+      assign fmt_vsum_exponent[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_vsum_mantissa[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+    end
+  end
+
+  // -------------------
+  // Destination operand
+  // -------------------
+  logic        [NUM_FORMATS-1:0]                         fmt_dst_sign;
+  logic signed [NUM_FORMATS-1:0][SUPER_DST_EXP_BITS-1:0] fmt_dst_exponent;
+  logic        [NUM_FORMATS-1:0][SUPER_DST_MAN_BITS-1:0] fmt_dst_mantissa;
+
+  // FP Input initialization (Dst: accumulator operand taken from dst_operands_q)
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_dst_init_inputs
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (DstDotpFpFmtConfig[fmt]) begin : active_dst_format
+      logic [FP_WIDTH-1:0] trimmed_dst_ops;
+
+      // Classify input
+      fpnew_classifier #(
+        .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+        .NumOperands ( 1                            )
+      ) i_fpnew_classifier (
+        .operands_i ( trimmed_dst_ops                           ),
+        .is_boxed_i ( inp_pipe_is_boxed_q[NUM_INP_REGS][fmt][4] ),
+        .info_o     ( info_q[fmt][4]                            )
+      );
+      assign trimmed_dst_ops       = dst_operands_q[FP_WIDTH-1:0];
+      assign fmt_dst_sign[fmt]     = dst_operands_q[FP_WIDTH-1];
+      assign fmt_dst_exponent[fmt] = signed'({1'b0, dst_operands_q[MAN_BITS+:EXP_BITS]});
+      assign fmt_dst_mantissa[fmt] = {info_q[fmt][4].is_normal, dst_operands_q[MAN_BITS-1:0]}
+                                      << (SUPER_DST_MAN_BITS - MAN_BITS); // move to left of mantissa
+    end else begin : inactive_dst_format
+      assign info_q[fmt][4]        = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_dst_sign[fmt]     = fpnew_pkg::DONT_CARE;             // format disabled
+      assign fmt_dst_exponent[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_dst_mantissa[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+    end
+  end
+
+  // -------------------------------------------
+  // Operation selection and operand adjustment
+  // -------------------------------------------
+  fp_src_t             operand_a, operand_b, operand_c, operand_d;
+  fp_dst_t             operand_e;
+  fp_dst_t             operand_a_vsum, operand_c_vsum;
+  fpnew_pkg::fp_info_t info_a, info_b, info_c, info_d, info_e;
+  logic                a_sign, c_sign;
+
+  // | \c op_q  | \c op_mod_q | Operation Adjustment
+  // |:--------:|:-----------:|---------------------
+  // | SDOTP    | \c 0        | SDOTP:  none
+  // | SDOTP    | \c 1        | SDOTPN: Invert the sign of the first and second products (accumulator - dotp)
+  // | EXVSUM   | \c 0        | EXVSUM: none
+  // | EXVSUM   | \c 1        | EXVSUM: Invert the sign of the first and second addends
+  // | VSUM     | \c 0        | VSUM:   none
+  // | VSUM     | \c 1        | VSUM:   Invert the sign of the first and second addends
+  // | *others* | \c -        | *invalid*
+  // \note \c op_mod_q always inverts the sign of the addend.
+  always_comb begin : op_select
+    // Default assignments - packing-order-agnostic
+    operand_a = {fmt_sign[src_fmt_q][0], fmt_exponent[src_fmt_q][0], fmt_mantissa[src_fmt_q][0]};
+    operand_b = {fmt_sign[src_fmt_q][1], fmt_exponent[src_fmt_q][1], fmt_mantissa[src_fmt_q][1]};
+    operand_c = {fmt_sign[src_fmt_q][2], fmt_exponent[src_fmt_q][2], fmt_mantissa[src_fmt_q][2]};
+    operand_d = {fmt_sign[src_fmt_q][3], fmt_exponent[src_fmt_q][3], fmt_mantissa[src_fmt_q][3]};
+    operand_e = {fmt_dst_sign[dst_fmt_q], fmt_dst_exponent[dst_fmt_q], fmt_dst_mantissa[dst_fmt_q]};
+    operand_a_vsum = {fmt_vsum_sign[src_fmt_q][0], fmt_vsum_exponent[src_fmt_q][0], fmt_vsum_mantissa[src_fmt_q][0]};
+    operand_c_vsum = {fmt_vsum_sign[src_fmt_q][1], fmt_vsum_exponent[src_fmt_q][1], fmt_vsum_mantissa[src_fmt_q][1]};
+    info_a    = info_q[src_fmt_q][0];
+    info_b    = info_q[src_fmt_q][1];
+    info_c    = info_q[src_fmt_q][2];
+    info_d    = info_q[src_fmt_q][3];
+    info_e    = info_q[dst_fmt_q][4];
+
+    // op_mod_q inverts sign of operand A and C, thus inverting the sign of the dot product
+    operand_a.sign = operand_a.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];
+    operand_c.sign = operand_c.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];
+    a_sign    = operand_a.sign;
+    c_sign    = operand_c.sign;
+    // op_mod_q inverts sign of operand A and C, thus inverting the sign of the vsum
+    operand_a_vsum.sign = operand_a_vsum.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];
+    operand_c_vsum.sign = operand_c_vsum.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];
+
+    unique case (inp_pipe_op_q[NUM_INP_REGS])
+      fpnew_pkg::SDOTP:  ; // do nothing
+      fpnew_pkg::VSUM: begin // Set multiplicands coming from rs1 to +1
+        operand_b = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
+        operand_d = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
+        info_b    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
+        info_d    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
+        info_a    = info_vsum_q[dst_fmt_q][0]; // NOTE(review): indexed by dst_fmt_q, operand_a_vsum by src_fmt_q - confirm equal for VSUM
+        info_c    = info_vsum_q[dst_fmt_q][1]; // NOTE(review): same dst_fmt_q vs. src_fmt_q indexing as info_a
+        a_sign    = operand_a_vsum.sign;
+        c_sign    = operand_c_vsum.sign;
+      end
+      fpnew_pkg::EXVSUM: begin // Set multiplicands coming from rs1 to +1
+        operand_b = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
+        operand_d = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
+        info_b    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
+        info_d    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
+      end
+      default: begin // propagate don't cares; NOTE(review): operand_d/info_d are not overridden here - confirm intended
+        operand_a  = '{default: fpnew_pkg::DONT_CARE};
+        operand_b  = '{default: fpnew_pkg::DONT_CARE};
+        operand_c  = '{default: fpnew_pkg::DONT_CARE};
+        info_a     = '{default: fpnew_pkg::DONT_CARE};
+        info_b     = '{default: fpnew_pkg::DONT_CARE};
+        info_c     = '{default: fpnew_pkg::DONT_CARE};
+      end
+    endcase
+  end
+
+  // ---------------------
+  // Input classification
+  // ---------------------
+  logic       any_operand_inf;
+  logic       any_operand_nan;
+  logic       signalling_nan;
+  logic [2:0] effective_subtraction;
+  logic       tentative_sign;
+
+  // Reduction for special case handling
+  assign any_operand_inf = (| {info_a.is_inf, info_b.is_inf, info_c.is_inf, info_d.is_inf, info_e.is_inf});
+  assign any_operand_nan = (| {info_a.is_nan, info_b.is_nan, info_c.is_nan, info_d.is_nan, info_e.is_nan});
+  assign signalling_nan  = (| {info_a.is_signalling, info_b.is_signalling, info_c.is_signalling,
+                               info_d.is_signalling, info_e.is_signalling});
+  // Effective subtractions in the three-term addition
+  assign effective_subtraction[0] = (a_sign ^ operand_b.sign) ^ operand_e.sign;
+  assign effective_subtraction[1] = (c_sign ^ operand_d.sign) ^ operand_e.sign;
+  assign effective_subtraction[2] = (a_sign ^ operand_b.sign) ^ (c_sign ^ operand_d.sign);
+
+  // ----------------------
+  // Special case handling
+  // ----------------------
+  logic [DST_WIDTH-1:0] special_result;
+  fpnew_pkg::status_t   special_status;
+  logic                 result_is_special;
+
+  logic               [NUM_FORMATS-1:0][DST_WIDTH-1:0] fmt_special_result;
+  fpnew_pkg::status_t [NUM_FORMATS-1:0]                fmt_special_status;
+  logic               [NUM_FORMATS-1:0]                fmt_result_is_special;
+
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_special_results
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = '1;
+    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+    localparam logic [MAN_BITS-1:0] ZERO_MANTISSA = '0;
+
+    if (DstDotpFpFmtConfig[fmt]) begin : active_format
+      always_comb begin : special_cases
+        logic [FP_WIDTH-1:0] special_res;
+
+        // Default assignment
+        special_res                = {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+        fmt_special_status[fmt]    = '0;
+        fmt_result_is_special[fmt] = 1'b0;
+
+        // Handle potentially mixed nan & infinity input => important for the case where infinity and
+        // zero are multiplied and added to a qNaN.
+        // RISC-V mandates raising the NV exception in these cases:
+        // (inf * 0) + c or (0 * inf) + c INVALID, no matter c (even quiet NaNs)
+        if (  ((info_a.is_inf && info_b.is_zero) || (info_a.is_zero && info_b.is_inf))
+           || ((info_c.is_inf && info_d.is_zero) || (info_c.is_zero && info_d.is_inf)) ) begin
+          fmt_result_is_special[fmt] = 1'b1; // bypass DOTP, output is the canonical qNaN
+          fmt_special_status[fmt].NV = 1'b1; // invalid operation
+        // NaN Inputs cause canonical quiet NaN at the output and maybe invalid OP
+        end else if (any_operand_nan) begin
+          fmt_result_is_special[fmt] = 1'b1;           // bypass DOTP, output is the canonical qNaN
+          fmt_special_status[fmt].NV = signalling_nan; // raise the invalid operation flag if signalling
+        // Special cases involving infinity
+        end else if (any_operand_inf) begin
+          fmt_result_is_special[fmt] = 1'b1; // bypass DOTP
+          // Effective addition of opposite infinities (±inf - ±inf) is invalid!
+          if ((info_a.is_inf || info_b.is_inf) && (info_c.is_inf || info_d.is_inf) && effective_subtraction[2]) begin
+            fmt_special_status[fmt].NV = 1'b1; // invalid operation
+          end else if (((info_a.is_inf || info_b.is_inf) && info_e.is_inf && effective_subtraction[0])
+             || ((info_c.is_inf || info_d.is_inf) && info_e.is_inf && effective_subtraction[1])) begin
+            fmt_special_status[fmt].NV = 1'b1; // invalid operation
+          // Handle cases where output will be inf because of inf product input
+          end else if (info_a.is_inf || info_b.is_inf) begin
+            // Result is infinity with the sign of the first product
+            special_res = {a_sign ^ operand_b.sign, QNAN_EXPONENT, ZERO_MANTISSA};
+          // Handle cases where the second product is inf
+          end else if (info_c.is_inf || info_d.is_inf) begin
+            // Result is infinity with sign of the second product
+            special_res    = {c_sign ^ operand_d.sign, QNAN_EXPONENT, ZERO_MANTISSA};
+          end else if (info_e.is_inf) begin
+            // Result is infinity with sign of the accumulator
+            special_res    = {operand_e.sign, QNAN_EXPONENT, ZERO_MANTISSA};
+          end
+        end
+        // Initialize special result with ones (NaN-box)
+        fmt_special_result[fmt]               = '1;
+        fmt_special_result[fmt][FP_WIDTH-1:0] = special_res;
+      end
+    end else begin : inactive_format
+      assign fmt_special_result[fmt] = '{default: fpnew_pkg::DONT_CARE};
+      assign fmt_special_status[fmt] = '0;
+      assign fmt_result_is_special[fmt] = 1'b0;
+    end
+  end
+
+  // Detect special case from destination format
+  assign result_is_special = fmt_result_is_special[dst_fmt_q];
+  // Signalling input NaNs raise invalid flag, otherwise no flags set
+  assign special_status = fmt_special_status[dst_fmt_q];
+  // Assemble result according to destination format
+  assign special_result = fmt_special_result[dst_fmt_q];
+
+  // ---------------------------
+  // Initial exponent data path
+  // ---------------------------
+  logic signed [EXP_WIDTH-1:0]     exponent_a, exponent_b, exponent_c, exponent_d;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_e;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_a_vsum, exponent_c_vsum;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_addend_x, exponent_addend_y, exponent_addend_z;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_product_x, exponent_product_y, exponent_difference;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_max, exponent_int, exponent_min;
+  logic signed [DST_EXP_WIDTH-1:0] tentative_exponent;
+  logic [2:0]                      exponent_cmp;
+  logic                            effective_subtraction_first;
+  logic                            info_min_is_zero;
+  logic                            info_int_is_zero;
+  logic                            info_max_is_zero;
+  logic                            addend_min_sign;
+  logic                            addend_int_sign;
+  logic                            addend_max_sign;
+
+  // Zero-extend exponents into signed container - implicit width extension
+  // The *_vsum variants are taken from the operand_a_vsum / operand_c_vsum views of the inputs.
+  assign exponent_a = signed'({1'b0, operand_a.exponent});
+  assign exponent_a_vsum = signed'({1'b0, operand_a_vsum.exponent});
+  assign exponent_b = signed'({1'b0, operand_b.exponent});
+  assign exponent_c = signed'({1'b0, operand_c.exponent});
+  assign exponent_c_vsum = signed'({1'b0, operand_c_vsum.exponent});
+  assign exponent_d = signed'({1'b0, operand_d.exponent});
+  assign exponent_e = signed'({1'b0, operand_e.exponent});
+
+  // Calculate internal exponents from encoded values. Real exponents are (ex = Ex - bias + 1 - nx)
+  // with Ex the encoded exponent and nx the implicit bit. Internal exponents stay biased.
+  // Biased product exponent is the sum of the encoded source-format exponents (+1 for each
+  // subnormal input), minus twice the source bias, plus the destination bias.
+  assign exponent_product_y = (info_c.is_zero || info_d.is_zero)
+                              ? 2 - signed'(fpnew_pkg::bias(dst_fmt_q)) // in case the product is zero, set minimum exp.
+                              : signed'(exponent_c + info_c.is_subnormal
+                                        + exponent_d + info_d.is_subnormal
+                                        - 2*signed'(fpnew_pkg::bias(src_fmt_q))  // remove both source-format biases
+                                        + signed'(fpnew_pkg::bias(dst_fmt_q)) + 1); // add dst bias; +1 to take the following shifts into account
+  assign exponent_product_x = (info_a.is_zero || info_b.is_zero)
+                              ? 2 - signed'(fpnew_pkg::bias(dst_fmt_q)) // in case the product is zero, set minimum exp.
+                              : signed'(exponent_a + info_a.is_subnormal
+                                        + exponent_b + info_b.is_subnormal
+                                        - 2*signed'(fpnew_pkg::bias(src_fmt_q))  // remove both source-format biases
+                                        + signed'(fpnew_pkg::bias(dst_fmt_q)) + 1); // add dst bias; +1 to take the following shift into account
+  // For VSUM the addend exponent is the *_vsum input exponent (+1 when the input is not normal)
+  // instead of the product exponent.
+  assign exponent_addend_y = (inp_pipe_op_q[NUM_INP_REGS] == fpnew_pkg::VSUM)
+                             ? signed'(exponent_c_vsum + $signed({1'b0, ~info_c.is_normal}))
+                             : exponent_product_y;
+  assign exponent_addend_x = (inp_pipe_op_q[NUM_INP_REGS] == fpnew_pkg::VSUM)
+                             ? signed'(exponent_a_vsum + $signed({1'b0, ~info_a.is_normal}))
+                             : exponent_product_x;
+  // Accumulator exponent, +1 when the accumulator is not normal
+  assign exponent_addend_z = signed'(exponent_e + $signed({1'b0, ~info_e.is_normal})); // 0 as subnorm
+
+  // Pairwise comparison of the three addend exponents, used to rank them:
+  // bit 2: x >= y, bit 1: x >= z, bit 0: y >= z
+  assign exponent_cmp = {(exponent_addend_x >= exponent_addend_y),
+                         (exponent_addend_x >= exponent_addend_z),
+                         (exponent_addend_y >= exponent_addend_z)};
+
+  // The three-term addition is performed in two steps with a single final normalization and
+  // rounding step. To prevent precision loss, the two addends with the largest exponents are
+  // summed first; the addend with the smallest exponent is then added to that partial sum.
+
+  // Rank the addends x, y and z by exponent and forward, for each rank, the matching zero flag
+  // and sign, plus the effective-subtraction flag of the first addition (max vs. intermediate).
+  always_comb begin : compare_exponents
+    // Default ordering: x >= y >= z. It serves 3'b111 as well as the impossible encodings
+    // 3'b010 and 3'b101.
+    {exponent_max, exponent_int, exponent_min} =
+        {exponent_addend_x, exponent_addend_y, exponent_addend_z};
+    {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+        {(info_a.is_zero || info_b.is_zero), (info_c.is_zero || info_d.is_zero), info_e.is_zero};
+    {addend_max_sign, addend_int_sign, addend_min_sign} =
+        {(a_sign ^ operand_b.sign), (c_sign ^ operand_d.sign), operand_e.sign};
+    effective_subtraction_first = effective_subtraction[2];
+
+    case (exponent_cmp)
+      // (x < y), (x < z), (y < z) --> z > y > x
+      3'b000 : begin
+        {exponent_max, exponent_int, exponent_min} =
+            {exponent_addend_z, exponent_addend_y, exponent_addend_x};
+        {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+            {info_e.is_zero, (info_c.is_zero || info_d.is_zero), (info_a.is_zero || info_b.is_zero)};
+        {addend_max_sign, addend_int_sign, addend_min_sign} =
+            {operand_e.sign, (c_sign ^ operand_d.sign), (a_sign ^ operand_b.sign)};
+        effective_subtraction_first = effective_subtraction[1];
+      end
+      // (x < y), (x < z), (y >= z) --> y >= z > x
+      3'b001 : begin
+        {exponent_max, exponent_int, exponent_min} =
+            {exponent_addend_y, exponent_addend_z, exponent_addend_x};
+        {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+            {(info_c.is_zero || info_d.is_zero), info_e.is_zero, (info_a.is_zero || info_b.is_zero)};
+        {addend_max_sign, addend_int_sign, addend_min_sign} =
+            {(c_sign ^ operand_d.sign), operand_e.sign, (a_sign ^ operand_b.sign)};
+        effective_subtraction_first = effective_subtraction[1];
+      end
+      // 3'b010 : (x < y), (x >= z), (y < z) --> IMPOSSIBLE
+      // (x < y), (x >= z), (y >= z) --> y > x >= z
+      3'b011 : begin
+        {exponent_max, exponent_int, exponent_min} =
+            {exponent_addend_y, exponent_addend_x, exponent_addend_z};
+        {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+            {(info_c.is_zero || info_d.is_zero), (info_a.is_zero || info_b.is_zero), info_e.is_zero};
+        {addend_max_sign, addend_int_sign, addend_min_sign} =
+            {(c_sign ^ operand_d.sign), (a_sign ^ operand_b.sign), operand_e.sign};
+        effective_subtraction_first = effective_subtraction[2];
+      end
+      // (x >= y), (x < z), (y < z) --> z > x >= y
+      3'b100 : begin
+        {exponent_max, exponent_int, exponent_min} =
+            {exponent_addend_z, exponent_addend_x, exponent_addend_y};
+        {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+            {info_e.is_zero, (info_a.is_zero || info_b.is_zero), (info_c.is_zero || info_d.is_zero)};
+        {addend_max_sign, addend_int_sign, addend_min_sign} =
+            {operand_e.sign, (a_sign ^ operand_b.sign), (c_sign ^ operand_d.sign)};
+        effective_subtraction_first = effective_subtraction[0];
+      end
+      // 3'b101 : (x >= y), (x < z), (y >= z) --> IMPOSSIBLE
+      // (x >= y), (x >= z), (y < z) --> x >= z > y
+      3'b110 : begin
+        {exponent_max, exponent_int, exponent_min} =
+            {exponent_addend_x, exponent_addend_z, exponent_addend_y};
+        {info_max_is_zero, info_int_is_zero, info_min_is_zero} =
+            {(info_a.is_zero || info_b.is_zero), info_e.is_zero, (info_c.is_zero || info_d.is_zero)};
+        {addend_max_sign, addend_int_sign, addend_min_sign} =
+            {(a_sign ^ operand_b.sign), operand_e.sign, (c_sign ^ operand_d.sign)};
+        effective_subtraction_first = effective_subtraction[0];
+      end
+      // 3'b111 : (x >= y), (x >= z), (y >= z) --> x >= y >= z, i.e. the default ordering
+      default : ; // keep the default ordering
+    endcase
+
+    // The tentative sign of the DOTP is the sign of the addend with the maximum exponent
+    tentative_sign = addend_max_sign;
+  end
+
+  // Exponent difference is the maximum addend exponent minus the intermediate addend exponent,
+  // where the addends are selected among the two products and the accumulator.
+  // In the case of non-expanding VSUM, the two products are replaced by the larger inputs (the
+  // multipliers are bypassed).
+  assign exponent_difference = exponent_max - exponent_int;
+  // The tentative exponent will be the maximum exponent
+  assign tentative_exponent = exponent_max;
+
+  // Right-shift amount applied to the intermediate addend (unsigned as only right shifts)
+  logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt;
+  always_comb begin : addend_shift_amount
+    // Saturated default: the intermediate addend ends up only in the sticky bits
+    addend_shamt = 2*DST_PRECISION_BITS + 3;
+    // Within range, the maximum and the intermediate addends have mutual bits to add
+    if (exponent_difference <= signed'(2*DST_PRECISION_BITS + 3)) begin
+      addend_shamt = unsigned'(signed'(exponent_difference));
+    end
+  end
+
+  // ------------------
+  // Product data path
+  // ------------------
+  logic     [PRECISION_BITS-1:0] mantissa_a, mantissa_b, mantissa_c, mantissa_d;
+  logic [DST_PRECISION_BITS-1:0] mantissa_e;
+  logic [DST_PRECISION_BITS-1:0] mantissa_a_vsum, mantissa_c_vsum;
+  logic   [2*PRECISION_BITS-1:0] product_x, product_y;  // the p*p product is 2p-bit wide
+
+  // Add implicit bits to mantissae: the leading bit is 1 for normal numbers and 0 otherwise
+  assign mantissa_a = {info_a.is_normal, operand_a.mantissa};
+  assign mantissa_b = {info_b.is_normal, operand_b.mantissa};
+  assign mantissa_c = {info_c.is_normal, operand_c.mantissa};
+  assign mantissa_d = {info_d.is_normal, operand_d.mantissa};
+  assign mantissa_e = {info_e.is_normal, operand_e.mantissa};
+
+  // Same for the operand_a_vsum / operand_c_vsum views, which are DST_PRECISION_BITS wide
+  assign mantissa_a_vsum = {info_a.is_normal, operand_a_vsum.mantissa};
+  assign mantissa_c_vsum = {info_c.is_normal, operand_c_vsum.mantissa};
+
+  // Mantissa multiplier (a*b)
+  assign product_x = mantissa_a * mantissa_b;
+  // Mantissa multiplier (c*d)
+  assign product_y = mantissa_c * mantissa_d;
+
+  // ------------------
+  // Shift data path
+  // ------------------
+  // The three addends are DST_PRECISION_BITS-wide since they might contain a product, which is
+  // expressed with 2*PRECISION_BITS (< DST_PRECISION_BITS), or the accumulator which is expressed
+  // with DST_PRECISION_BITS. In the case of non-expanding VSUM, all the operands are
+  // DST_PRECISION_BITS-wide, if the largest format allowed is selected, or boxed into
+  // DST_PRECISION_BITS, if a narrower format is selected.
+  logic   [DST_PRECISION_BITS-1:0] addend_x, addend_y, addend_z;
+  logic   [DST_PRECISION_BITS-1:0] addend_max, addend_int, addend_min;
+  logic [2*DST_PRECISION_BITS+2:0] addend_max_shifted;
+  logic [2*DST_PRECISION_BITS+2:0] addend_int_after_shift;
+  logic   [DST_PRECISION_BITS-1:0] addend_sticky_bits;
+  logic                            sticky_before_add;
+  logic [2*DST_PRECISION_BITS+2:0] addend_int_shifted;
+  logic                            inject_carry_in;     // inject carry for subtractions if needed
+
+  // Bypass the multipliers in case of non-expanding VSUM: addends x and y are then the
+  // mantissas of the *_vsum operands.
+  // Otherwise the products are left-shifted by ADDITIONAL_PRECISION_BITS to place them in the
+  // upper part of the addend (the addend uses DST_PRECISION_BITS while 2*PRECISION_BITS might
+  // be narrower).
+  assign addend_x = (inp_pipe_op_q[NUM_INP_REGS] == fpnew_pkg::VSUM)
+                      ? mantissa_a_vsum : product_x << ADDITIONAL_PRECISION_BITS;
+  assign addend_y = (inp_pipe_op_q[NUM_INP_REGS] == fpnew_pkg::VSUM)
+                      ? mantissa_c_vsum : product_y << ADDITIONAL_PRECISION_BITS;
+  // Addend z is always the accumulator mantissa
+  assign addend_z = mantissa_e;
+
+  // Sort the addend mantissas with the same ranking (exponent_cmp) used for the exponents
+  always_comb begin : sort_addends
+    // Default ordering x >= y >= z: used for 3'b111 and for the impossible encodings
+    // 3'b010 ((x < y), (x >= z), (y < z)) and 3'b101 ((x >= y), (x < z), (y >= z)).
+    {addend_max, addend_int, addend_min} = {addend_x, addend_y, addend_z};
+    case (exponent_cmp)
+      // (x < y), (x < z), (y < z)
+      3'b000  : {addend_max, addend_int, addend_min} = {addend_z, addend_y, addend_x};
+      // (x < y), (x < z), (y >= z)
+      3'b001  : {addend_max, addend_int, addend_min} = {addend_y, addend_z, addend_x};
+      // (x < y), (x >= z), (y >= z)
+      3'b011  : {addend_max, addend_int, addend_min} = {addend_y, addend_x, addend_z};
+      // (x >= y), (x < z), (y < z)
+      3'b100  : {addend_max, addend_int, addend_min} = {addend_z, addend_x, addend_y};
+      // (x >= y), (x >= z), (y < z)
+      3'b110  : {addend_max, addend_int, addend_min} = {addend_x, addend_z, addend_y};
+      // (x >= y), (x >= z), (y >= z) and impossible encodings: keep the default ordering
+      default : ;
+    endcase
+  end
+
+  // The maximum addend is placed into a (2*p_dst + 3)-bit wide vector. It is padded with p_dst
+  // low bits plus 3 bits for rounding purposes:
+  // |  addend_max  |  000...000  |  rnd  |
+  //  <-  p_dst   -> <-  p_dst  -> <  3   >
+  assign addend_max_shifted = addend_max << (3 + DST_PRECISION_BITS); // constant shift
+
+  // In parallel, the intermediate addend is right-shifted according to the exponent difference.
+  // Up to p_dst bits are shifted out and compressed into a sticky bit.
+  // BEFORE THE SHIFT:
+  // | addend_int | 000.....000 |
+  //  <- p_dst  -> <- p_dst+3 ->
+  // AFTER THE SHIFT:
+  // | 000..........000 | addend_int | 000..................0GR |    sticky bits    |
+  //  <- addend_shamt -> <- p_dst  -> <- p_dst+3-addend_shamt -> <-  up to p_dst  ->
+  assign {addend_int_after_shift, addend_sticky_bits} =
+      (addend_int << (2*DST_PRECISION_BITS + 3)) >> addend_shamt;
+
+  // Any non-zero bit shifted out of the intermediate addend sets the sticky bit
+  assign sticky_before_add     = (| addend_sticky_bits);
+
+  // In case of a subtraction, the addend is inverted
+  assign addend_int_shifted  = (effective_subtraction_first) ? ~addend_int_after_shift : addend_int_after_shift;
+  // The carry-in is injected for subtractions only when no non-zero bits were shifted out
+  assign inject_carry_in = effective_subtraction_first & ~sticky_before_add;
+
+  // ------
+  // Adder
+  // ------
+  // First step of the three-way addition: maximum addend +/- intermediate addend
+  logic [2*DST_PRECISION_BITS+3:0] sum_raw;   // added one bit for the carry
+  logic                            sum_carry; // observe carry bit from sum for sign fixing
+  logic [2*DST_PRECISION_BITS+2:0] sum;       // discard carry
+  logic                            final_sign;
+  logic                            sum_exact_zero;
+
+  // Mantissa adder (addend_max + addend_int)
+  assign sum_raw = addend_max_shifted + addend_int_shifted + inject_carry_in;
+  assign sum_carry = sum_raw[2*DST_PRECISION_BITS+3];
+
+  // Complement negative sum (can only happen in subtraction -> overflows for positive results)
+  // The assignment to the narrower `sum` drops the carry bit.
+  assign sum        = (effective_subtraction_first && ~sum_carry) ? -sum_raw : sum_raw;
+
+  // Check whether the result is an exact zero for rounding purposes (needed to set the sign of a
+  // final result equal to zero): a subtraction whose sum is all-zero with the carry set and no
+  // sticky bits.
+  assign sum_exact_zero = (sum == '0) && sum_carry && !sticky_before_add && effective_subtraction_first;
+  // Sign of the first addition:
+  //  - exact zero: negative only when the rounding mode is RDN
+  //  - subtraction: 1 when the carry equals the tentative sign, 0 otherwise (this covers the
+  //    sign flip of a mispredicted subtraction result)
+  //  - addition: the tentative sign
+  assign final_sign = sum_exact_zero ? (inp_pipe_rnd_mode_q[NUM_INP_REGS] == fpnew_pkg::RDN)
+                                        : (effective_subtraction_first && (sum_carry == tentative_sign))
+                                              ? 1'b1
+                                              : (effective_subtraction_first ? 1'b0 : tentative_sign);
+
+  // -------------
+  // Second Shift
+  // -------------
+  logic signed [DST_EXP_WIDTH-1:0] exponent_difference_z;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_w;
+  logic signed [DST_EXP_WIDTH-1:0] tentative_exponent_z;
+
+  // W comes from the first addition. Adding +1 to take into account the following shift
+  assign exponent_w = signed'(tentative_exponent + 1);
+  // Exponent difference is the exponent of the first addition result (W) minus the minimum exponent
+  assign exponent_difference_z = exponent_w - exponent_min;
+  // The tentative exponent of the second addition is the exponent of W
+  assign tentative_exponent_z  = exponent_w;
+
+  // Shift amount for addend based on exponents (unsigned as only right shifts)
+  logic [DST_SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_z;
+  logic   [2*DST_PRECISION_BITS+PRECISION_BITS+3:0] addend_min_after_shift;
+  logic     [DST_PRECISION_BITS-1:0] addend_sticky_bits_z;  // up to p_dst bit of shifted addend are sticky
+  logic                              sticky_before_add_z;   // they are compressed into a single sticky bit
+
+  always_comb begin : addend_shift_amount_z
+    // Saturated default: the minimum addend ends up only in the sticky bits
+    addend_shamt_z = 2 * DST_PRECISION_BITS + PRECISION_BITS + 4;
+    // Within range, the result of the first addition and the minimum addend have mutual bits
+    // to add
+    if (exponent_difference_z <= signed'(2 * DST_PRECISION_BITS + PRECISION_BITS + 4)) begin
+      addend_shamt_z = unsigned'(signed'(exponent_difference_z));
+    end
+  end
+
+  // Shift the minimum addend
+  // (p_dst = DST_PRECISION_BITS, p = PRECISION_BITS)
+  // BEFORE THE SHIFT:
+  // | addend_min | 000.......000 |
+  //  <- p_dst  -> <- p_dst+p+4 ->
+  // AFTER THE SHIFT:
+  // | 000............000 | addend_min | 000.......................0GR |    sticky bits    |
+  //  <- addend_shamt_z -> <- p_dst  -> <- p_dst+p+4-addend_shamt_z -> <-  up to p_dst  ->
+  assign {addend_min_after_shift, addend_sticky_bits_z} =
+      (addend_min << (2 * DST_PRECISION_BITS + PRECISION_BITS + 4)) >> addend_shamt_z;
+
+  // Any non-zero bit shifted out of the minimum addend sets the sticky bit
+  assign sticky_before_add_z     = (| addend_sticky_bits_z);
+
+  // When both the first and the second addition give a zero result, some more checks on the
+  // addends are needed to select the right final sign.
+  logic final_sign_zero;
+  always_comb begin
+    // Unless the maximum-exponent addend is zero, its sign is used
+    final_sign_zero = addend_max_sign;
+    if (info_max_is_zero) begin
+      case ({info_int_is_zero, info_min_is_zero})
+        // Intermediate and minimum addends are both non-zero: compare exponents, then mantissas
+        2'b00 : begin
+          if ((exponent_int > exponent_min) || (addend_int > addend_min)) begin
+            final_sign_zero = addend_int_sign;
+          end else if (addend_int == addend_min) begin
+            final_sign_zero = (addend_max_sign) ? addend_int_sign | addend_min_sign
+                                                : addend_int_sign & addend_min_sign;
+          end else begin
+            final_sign_zero = addend_min_sign;
+          end
+        end
+        // Only the intermediate addend is non-zero
+        2'b01 : final_sign_zero = addend_int_sign;
+        // Only the minimum addend is non-zero
+        2'b10 : final_sign_zero = addend_min_sign;
+        // All three addends are zero
+        2'b11 : final_sign_zero = (addend_max_sign) ? addend_int_sign | addend_min_sign
+                                                    : addend_int_sign & addend_min_sign;
+        default : ; // keep the sign of the maximum addend
+      endcase
+    end
+  end
+
+  // -----------------
+  // Internal pipeline
+  // -----------------
+  // Pipeline output signals as non-arrays
+  logic                            effective_subtraction_first_q;
+  logic                            final_sign_zero_q;
+  logic                            info_min_is_zero_q;
+  logic                            info_max_is_zero_q;
+  logic                            addend_min_sign_q;
+  logic                            sum_exact_zero_q;
+  logic [DST_PRECISION_BITS-1:0]   addend_min_q;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_w_q;
+  logic                            sticky_before_add_z_q;   // they are compressed into a single sticky bit
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+3:0] addend_min_after_shift_q;
+  logic                            operand_e_sign_q;
+  logic                            product_x_sign_q;
+  logic                            product_y_sign_q;
+  logic [2:0]                      exponent_cmp_q;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_min_q;
+  logic                            sticky_before_add_q;
+  logic [2*DST_PRECISION_BITS+2:0] sum_q;
+  logic                            final_sign_q;
+  fpnew_pkg::fp_format_e           dst_fmt_q2;
+  fpnew_pkg::roundmode_e           rnd_mode_q;
+  logic                            result_is_special_q;
+  fp_dst_t                         special_result_q;
+  fpnew_pkg::status_t              special_status_q;
+  logic                            sum_carry_q;
+  // Internal pipeline signals, index i holds signal after i register stages
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_eff_sub_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_final_sign_zero_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_info_min_is_zero_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_info_max_is_zero_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_addend_min_sign_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_sum_exact_zero_q;
+  logic                  [0:NUM_MID_REGS][DST_PRECISION_BITS-1:0]   mid_pipe_addend_min_q;
+  logic signed           [0:NUM_MID_REGS][DST_EXP_WIDTH-1:0]        mid_pipe_exp_first_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_sticky_before_add_z_q;
+  logic                  [0:NUM_MID_REGS][2*DST_PRECISION_BITS+PRECISION_BITS+3:0] mid_pipe_add_min_after_shift_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_op_e_sign_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_prod_x_sign_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_prod_y_sign_q;
+  logic                  [0:NUM_MID_REGS][2:0]                      mid_pipe_exp_cmp_q;
+  logic signed           [0:NUM_MID_REGS][DST_EXP_WIDTH-1:0]        mid_pipe_exp_min_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_sticky_q;
+  logic                  [0:NUM_MID_REGS][2*DST_PRECISION_BITS+2:0] mid_pipe_sum_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_final_sign_q;
+  fpnew_pkg::fp_format_e [0:NUM_MID_REGS]                           mid_pipe_dst_fmt_q;
+  fpnew_pkg::roundmode_e [0:NUM_MID_REGS]                           mid_pipe_rnd_mode_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_res_is_spec_q;
+  fp_dst_t               [0:NUM_MID_REGS]                           mid_pipe_spec_res_q;
+  fpnew_pkg::status_t    [0:NUM_MID_REGS]                           mid_pipe_spec_stat_q;
+  TagType                [0:NUM_MID_REGS]                           mid_pipe_tag_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_mask_q;
+  AuxType                [0:NUM_MID_REGS]                           mid_pipe_aux_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_valid_q;
+  logic                  [0:NUM_MID_REGS]                           mid_pipe_sum_carry_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_MID_REGS] mid_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from upstream logic
+  // Index 0 of every mid_pipe_* array is driven combinationally from the first adder step and
+  // from the last stage of the input pipeline.
+  assign mid_pipe_eff_sub_q[0]                = effective_subtraction_first;
+  assign mid_pipe_final_sign_zero_q[0]        = final_sign_zero;
+  assign mid_pipe_info_min_is_zero_q[0]       = info_min_is_zero;
+  assign mid_pipe_info_max_is_zero_q[0]       = info_max_is_zero;
+  assign mid_pipe_addend_min_sign_q[0]        = addend_min_sign;
+  assign mid_pipe_sum_exact_zero_q[0]         = sum_exact_zero;
+  assign mid_pipe_addend_min_q[0]             = addend_min;
+  assign mid_pipe_exp_first_q[0]              = exponent_w;
+  assign mid_pipe_sticky_before_add_z_q[0]    = sticky_before_add_z;
+  assign mid_pipe_add_min_after_shift_q[0]    = addend_min_after_shift;
+  assign mid_pipe_op_e_sign_q[0]              = operand_e.sign;
+  assign mid_pipe_prod_x_sign_q[0]            = (a_sign ^ operand_b.sign);
+  assign mid_pipe_prod_y_sign_q[0]            = (c_sign ^ operand_d.sign);
+  assign mid_pipe_exp_cmp_q[0]                = exponent_cmp;
+  assign mid_pipe_exp_min_q[0]                = exponent_min;
+  assign mid_pipe_sticky_q[0]                 = sticky_before_add;
+  assign mid_pipe_sum_q[0]                    = sum;
+  assign mid_pipe_final_sign_q[0]             = final_sign;
+  assign mid_pipe_rnd_mode_q[0]               = inp_pipe_rnd_mode_q[NUM_INP_REGS];
+  assign mid_pipe_dst_fmt_q[0]                = dst_fmt_q;
+  assign mid_pipe_res_is_spec_q[0]            = result_is_special;
+  assign mid_pipe_spec_res_q[0]               = special_result;
+  assign mid_pipe_spec_stat_q[0]              = special_status;
+  assign mid_pipe_tag_q[0]                    = inp_pipe_tag_q[NUM_INP_REGS];
+  assign mid_pipe_mask_q[0]                   = inp_pipe_mask_q[NUM_INP_REGS];
+  assign mid_pipe_aux_q[0]                    = inp_pipe_aux_q[NUM_INP_REGS];
+  assign mid_pipe_valid_q[0]                  = inp_pipe_valid_q[NUM_INP_REGS];
+  assign mid_pipe_sum_carry_q[0]              = sum_carry;
+  // Input stage: Propagate pipeline ready signal to input pipe
+  assign inp_pipe_ready[NUM_INP_REGS]         = mid_pipe_ready[0];
+
+  // Generate the register stages: NUM_MID_REGS stages, stage i registers index i into index i+1
+  for (genvar i = 0; i < NUM_MID_REGS; i++) begin : gen_inside_pipeline
+    // Internal register enable for this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign mid_pipe_ready[i] = mid_pipe_ready[i+1] | ~mid_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(mid_pipe_valid_q[i+1], mid_pipe_valid_q[i], mid_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = mid_pipe_ready[i] & mid_pipe_valid_q[i];
+    // Generate the pipeline registers within the stages, use enable-registers
+    // (the last macro argument is the value passed as reset value)
+    `FFL(mid_pipe_eff_sub_q[i+1],             mid_pipe_eff_sub_q[i],             reg_ena, '0)
+    `FFL(mid_pipe_final_sign_zero_q[i+1],     mid_pipe_final_sign_zero_q[i],     reg_ena, '0)
+    `FFL(mid_pipe_info_min_is_zero_q[i+1],    mid_pipe_info_min_is_zero_q[i],    reg_ena, '0)
+    `FFL(mid_pipe_info_max_is_zero_q[i+1],    mid_pipe_info_max_is_zero_q[i],    reg_ena, '0)
+    `FFL(mid_pipe_addend_min_sign_q[i+1],     mid_pipe_addend_min_sign_q[i],     reg_ena, '0)
+    `FFL(mid_pipe_sum_exact_zero_q[i+1],      mid_pipe_sum_exact_zero_q[i],      reg_ena, '0)
+    `FFL(mid_pipe_addend_min_q[i+1],          mid_pipe_addend_min_q[i],          reg_ena, '0)
+    `FFL(mid_pipe_exp_first_q[i+1],           mid_pipe_exp_first_q[i],           reg_ena, '0)
+    `FFL(mid_pipe_sticky_before_add_z_q[i+1], mid_pipe_sticky_before_add_z_q[i], reg_ena, '0)
+    `FFL(mid_pipe_add_min_after_shift_q[i+1], mid_pipe_add_min_after_shift_q[i], reg_ena, '0)
+    `FFL(mid_pipe_op_e_sign_q[i+1],           mid_pipe_op_e_sign_q[i],           reg_ena, '0)
+    `FFL(mid_pipe_prod_x_sign_q[i+1],         mid_pipe_prod_x_sign_q[i],         reg_ena, '0)
+    `FFL(mid_pipe_prod_y_sign_q[i+1],         mid_pipe_prod_y_sign_q[i],         reg_ena, '0)
+    `FFL(mid_pipe_exp_cmp_q[i+1],             mid_pipe_exp_cmp_q[i],             reg_ena, '0)
+    `FFL(mid_pipe_exp_min_q[i+1],             mid_pipe_exp_min_q[i],             reg_ena, '0)
+    `FFL(mid_pipe_sticky_q[i+1],              mid_pipe_sticky_q[i],              reg_ena, '0)
+    `FFL(mid_pipe_sum_q[i+1],                 mid_pipe_sum_q[i],                 reg_ena, '0)
+    `FFL(mid_pipe_final_sign_q[i+1],          mid_pipe_final_sign_q[i],          reg_ena, '0)
+    `FFL(mid_pipe_rnd_mode_q[i+1],            mid_pipe_rnd_mode_q[i],            reg_ena, fpnew_pkg::RNE)
+    `FFL(mid_pipe_dst_fmt_q[i+1],             mid_pipe_dst_fmt_q[i],             reg_ena, fpnew_pkg::FP16)
+    `FFL(mid_pipe_res_is_spec_q[i+1],         mid_pipe_res_is_spec_q[i],         reg_ena, '0)
+    `FFL(mid_pipe_spec_res_q[i+1],            mid_pipe_spec_res_q[i],            reg_ena, '0)
+    `FFL(mid_pipe_spec_stat_q[i+1],           mid_pipe_spec_stat_q[i],           reg_ena, '0)
+    `FFL(mid_pipe_tag_q[i+1],                 mid_pipe_tag_q[i],                 reg_ena, TagType'('0))
+    `FFL(mid_pipe_mask_q[i+1],                mid_pipe_mask_q[i],                reg_ena, '0)
+    `FFL(mid_pipe_aux_q[i+1],                 mid_pipe_aux_q[i],                 reg_ena, AuxType'('0))
+    `FFL(mid_pipe_sum_carry_q[i+1],           mid_pipe_sum_carry_q[i],           reg_ena, '0)
+  end
+  // Output stage: assign selected pipe outputs to signals for later use
+  // (index NUM_MID_REGS is the output of the last mid-pipeline stage)
+  assign sum_carry_q                   = mid_pipe_sum_carry_q[NUM_MID_REGS];
+  assign addend_min_q                  = mid_pipe_addend_min_q[NUM_MID_REGS];
+  assign final_sign_zero_q             = mid_pipe_final_sign_zero_q[NUM_MID_REGS];
+  assign info_min_is_zero_q            = mid_pipe_info_min_is_zero_q[NUM_MID_REGS];
+  assign info_max_is_zero_q            = mid_pipe_info_max_is_zero_q[NUM_MID_REGS];
+  assign addend_min_sign_q             = mid_pipe_addend_min_sign_q[NUM_MID_REGS];
+  assign sum_exact_zero_q              = mid_pipe_sum_exact_zero_q[NUM_MID_REGS];
+  assign effective_subtraction_first_q = mid_pipe_eff_sub_q[NUM_MID_REGS];
+  assign exponent_w_q                  = mid_pipe_exp_first_q[NUM_MID_REGS];
+  assign sticky_before_add_z_q         = mid_pipe_sticky_before_add_z_q[NUM_MID_REGS];
+  assign addend_min_after_shift_q      = mid_pipe_add_min_after_shift_q[NUM_MID_REGS];
+  assign operand_e_sign_q              = mid_pipe_op_e_sign_q[NUM_MID_REGS];
+  assign product_x_sign_q              = mid_pipe_prod_x_sign_q[NUM_MID_REGS];
+  assign product_y_sign_q              = mid_pipe_prod_y_sign_q[NUM_MID_REGS];
+  assign exponent_cmp_q                = mid_pipe_exp_cmp_q[NUM_MID_REGS];
+  assign exponent_min_q                = mid_pipe_exp_min_q[NUM_MID_REGS];
+  assign sticky_before_add_q           = mid_pipe_sticky_q[NUM_MID_REGS];
+  assign sum_q                         = mid_pipe_sum_q[NUM_MID_REGS];
+  assign final_sign_q                  = mid_pipe_final_sign_q[NUM_MID_REGS];
+  assign rnd_mode_q                    = mid_pipe_rnd_mode_q[NUM_MID_REGS];
+  assign dst_fmt_q2                    = mid_pipe_dst_fmt_q[NUM_MID_REGS];
+  assign result_is_special_q           = mid_pipe_res_is_spec_q[NUM_MID_REGS];
+  assign special_result_q              = mid_pipe_spec_res_q[NUM_MID_REGS];
+  assign special_status_q              = mid_pipe_spec_stat_q[NUM_MID_REGS];
+
+  // ----------------------------------
+  // Second Step of the Three-way Adder
+  // ----------------------------------
+  // Bypass the first addition when its result is exactly zero, the minimum addend is not zero
+  // and non-zero bits of the minimum addend were shifted out (sticky bit set).
+  // Without bypassing, that situation might result in precision loss since the minimum addend is
+  // shifted in parallel with the first sum (i.e. if the minimum addend is much smaller than the
+  // other addends, it might have been shifted out before knowing that the result of the first
+  // addition was 0)
+  logic bypass_w;
+  assign bypass_w = sum_exact_zero_q && !info_min_is_zero_q && sticky_before_add_z_q;
+
+  logic [2*DST_PRECISION_BITS+3:0] mantissa_w;
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+3:0] mantissa_w_shifted;
+
+  logic                            tentative_sign_z;
+  logic                            effective_subtraction_z;
+  logic signed [DST_EXP_WIDTH-1:0] final_tentative_exponent;
+
+  // When bypassing, use the minimum addend exponent (clamped to zero if negative); otherwise
+  // use the exponent of the first addition result W
+  assign final_tentative_exponent = (bypass_w) ? (exponent_min_q >= 0) ? exponent_min_q : 1'b0
+                                               : exponent_w_q;
+
+  // Mantissa of W: the first sum, with its carry kept as MSB only for effective additions
+  assign mantissa_w = {sum_carry_q && ~effective_subtraction_first_q, sum_q};
+  // Left-shift W by PRECISION_BITS to match the width of the shifted minimum addend
+  assign mantissa_w_shifted = mantissa_w << PRECISION_BITS;
+
+  // The tentative sign shall be the sign of the first addition, or the sign of the minimum
+  // addend when the first addition is bypassed
+  assign tentative_sign_z = (bypass_w) ? addend_min_sign_q : final_sign_q;
+
+  always_comb begin
+    case (exponent_cmp_q)
+      3'b000  :  effective_subtraction_z = product_x_sign_q ^ tentative_sign_z;
+      3'b001  :  effective_subtraction_z = product_x_sign_q ^ tentative_sign_z;
+      3'b011  :  effective_subtraction_z = operand_e_sign_q ^ tentative_sign_z;
+      3'b100  :  effective_subtraction_z = product_y_sign_q ^ tentative_sign_z;
+      3'b110  :  effective_subtraction_z = product_y_sign_q ^ tentative_sign_z;
+      3'b111  :  effective_subtraction_z = operand_e_sign_q ^ tentative_sign_z;
+      default :  effective_subtraction_z = operand_e_sign_q ^ tentative_sign_z;
+    endcase
+  end
+
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+3:0] addend_min_shifted;
+  logic                            inject_carry_in_z; // inject carry for subtractions if needed
+
+  // In case of a subtraction, the addend is inverted
+  assign addend_min_shifted  = (effective_subtraction_z) ? ~addend_min_after_shift_q : addend_min_after_shift_q;
+  assign inject_carry_in_z = effective_subtraction_z & ~sticky_before_add_z_q;
+
+  // ------
+  // Adder
+  // ------
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+4:0] sum_raw_z;   // added one bit for the carry
+  logic                            sum_carry_z; // observe carry bit from sum for sign fixing
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+3:0] sum_z;       // discard carry as sum won't overflow
+  logic                            final_sign_z;
+
+  // Mantissa adder (W+Z)
+  assign sum_raw_z    = (bypass_w) ? (exponent_min_q > 0) ? addend_min_q << (DST_PRECISION_BITS+PRECISION_BITS+4)
+                                                          : (addend_min_q << (DST_PRECISION_BITS+PRECISION_BITS+4))
+                                                            >> signed'(-exponent_min_q+1)
+                                   : mantissa_w_shifted + addend_min_shifted + inject_carry_in_z;
+  assign sum_carry_z  = sum_raw_z[2*DST_PRECISION_BITS +PRECISION_BITS+ 4];
+
+  // Complement negative sum (can only happen in subtraction -> overflows for positive results)
+  assign sum_z        = (effective_subtraction_z && ~sum_carry_z) ? -sum_raw_z : sum_raw_z;
+
+  // In case of a mispredicted subtraction result, do a sign flip
+  assign final_sign_z = (effective_subtraction_z && (sum_carry_z == tentative_sign_z))
+                            ? 1'b1
+                            : (effective_subtraction_z ? 1'b0 : tentative_sign_z);
+
+  // --------------
+  // Normalization
+  // --------------
+  logic        [LZC_SUM_WIDTH-1:0]    sum_lower;              // LZC_SUM_WIDTH bits of sum are searched
+  logic        [LZC_RESULT_WIDTH-1:0] leading_zero_count;     // the number of leading zeroes
+  logic signed [LZC_RESULT_WIDTH:0]   leading_zero_count_sgn; // signed leading-zero count
+  logic                               lzc_zeroes;             // in case only zeroes found
+
+  logic        [DST_SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
+  logic signed [DST_EXP_WIDTH-1:0]          normalized_exponent;
+
+  logic [2*DST_PRECISION_BITS+PRECISION_BITS+4:0] sum_shifted;       // result after first normalization shift
+  logic     [DST_PRECISION_BITS:0] final_mantissa;    // final mantissa before rounding with round bit
+  logic   [DST_PRECISION_BITS+PRECISION_BITS+2:0] sum_sticky_bits;   // remaining p_dst+3 sticky bits after normalization
+  logic                            sticky_after_norm; // sticky bit after normalization
+
+  logic signed [DST_EXP_WIDTH-1:0] final_exponent;
+
+  assign sum_lower = {(~effective_subtraction_z && sum_carry_z), sum_z};
+
+  // Leading zero counter for cancellations
+  lzc #(
+    .WIDTH ( LZC_SUM_WIDTH   ),
+    .MODE  ( 1               ) // MODE = 1 counts leading zeroes
+  ) i_lzc (
+    .in_i    ( sum_lower          ),
+    .cnt_o   ( leading_zero_count ),
+    .empty_o ( lzc_zeroes         )
+  );
+
+  assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});
+
+  // Normalization shift amount based on exponents and LZC (unsigned as only left shifts)
+  always_comb begin : norm_shift_amount
+   if ((final_tentative_exponent - leading_zero_count_sgn + 1 > 0) && !lzc_zeroes) begin
+      // Remove the counted zeroes
+      if (leading_zero_count > 0) begin
+        norm_shamt          = leading_zero_count - 1;
+        normalized_exponent = final_tentative_exponent - leading_zero_count_sgn + 1; // account for shift
+      end else begin
+        norm_shamt          = '0;
+        normalized_exponent = final_tentative_exponent;
+      end
+    // Subnormal result
+    end else begin
+      // Cap the shift distance to align mantissa with minimum exponent
+      if (final_tentative_exponent > 0)
+        norm_shamt          = final_tentative_exponent - 1;
+      else
+        norm_shamt          = '0;
+      normalized_exponent = '0; // subnormals encoded as 0
+    end
+  end
+
+  // Do the large normalization shift
+  assign sum_shifted       = sum_lower << norm_shamt;
+
+  // Further 1-bit normalization since the leading-one can be to the left or right of the (non-carry)
+  // MSB of the sum.
+  always_comb begin : small_norm
+    // Default assignment, discarding carry bit
+    {final_mantissa, sum_sticky_bits} = sum_shifted;
+    final_exponent                    = normalized_exponent;
+
+    // The normalized sum has overflown, align right and fix exponent
+    if (sum_shifted[2*DST_PRECISION_BITS+PRECISION_BITS+4]) begin // check the carry bit
+      {final_mantissa, sum_sticky_bits} = sum_shifted >> 1;
+      final_exponent                    = normalized_exponent + 1;
+    // The normalized sum is normal, nothing to do
+    end else if (sum_shifted[2*DST_PRECISION_BITS+PRECISION_BITS+3]) begin // check the sum MSB
+      // do nothing
+    // The normalized sum is still denormal, align left - unless the result is already subnormal
+    end else if (normalized_exponent > 1) begin
+      {final_mantissa, sum_sticky_bits} = sum_shifted << 1;
+      final_exponent                    = normalized_exponent - 1;
+    // Otherwise we're denormal
+    end else begin
+      final_exponent = '0;
+    end
+  end
+
+  // Update the sticky bit with the shifted-out bits coming from the first addition
+  always_comb begin
+    sticky_after_norm = (| {sum_sticky_bits}) | (sticky_before_add_z_q && ~bypass_w) | sticky_before_add_q;
+    if (sticky_before_add_q && !effective_subtraction_first_q && !sticky_before_add_z_q
+        && effective_subtraction_z && (sum_sticky_bits == '0) && !info_min_is_zero_q) begin
+      sticky_after_norm = 1'b0;
+    end
+    if (sticky_before_add_q && effective_subtraction_first_q && !sticky_before_add_z_q
+       && !effective_subtraction_z && (sum_sticky_bits == '0) && !info_min_is_zero_q) begin
+      sticky_after_norm = 1'b0;
+    end
+  end
+
+  // ----------------------------
+  // Rounding and classification
+  // ----------------------------
+  logic                                             pre_round_sign;
+  logic [SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] pre_round_abs; // absolute value of result before rounding
+  logic [1:0]                                       round_sticky_bits;
+  logic [RSR_PRECISION_BITS-1:0]                    stochastic_rounding_bits; // bits for RSR rounding mode
+
+  logic of_before_round, of_after_round; // overflow
+  logic uf_before_round, uf_after_round; // underflow
+
+  logic [NUM_FORMATS-1:0][SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] fmt_pre_round_abs; // per format
+  logic [NUM_FORMATS-1:0][1:0]                                       fmt_round_sticky_bits;
+  logic [NUM_FORMATS-1:0][RSR_PRECISION_BITS-1:0]                    fmt_stochastic_rounding_bits;// bits for RSR rounding mode
+
+  logic [NUM_FORMATS-1:0]                           fmt_of_after_round;
+  logic [NUM_FORMATS-1:0]                           fmt_uf_after_round;
+
+  logic                                             rounded_sign;
+  logic [SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] rounded_abs; // absolute value of result after rounding
+  logic                                             result_zero;
+
+  // Classification before round. RISC-V mandates checking underflow AFTER rounding
+  assign of_before_round = final_exponent >= 2**(fpnew_pkg::exp_bits(dst_fmt_q2))-1; // infinity exponent is all ones
+  assign uf_before_round = final_exponent == 0;               // exponent for subnormals capped to 0
+
+  // Pack exponent and mantissa into proper rounding form
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_res_assemble
+    // Set up some constants
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned ALL_EXTRA_BITS = SUPER_DST_MAN_BITS-MAN_BITS+1+DST_PRECISION_BITS+PRECISION_BITS+2+1;
+
+    logic [EXP_BITS-1:0] pre_round_exponent;
+    logic [MAN_BITS-1:0] pre_round_mantissa;
+    logic [ALL_EXTRA_BITS-1:0] pre_round_all_extra_bits;
+
+    if (DstDotpFpFmtConfig[fmt]) begin : active_dst_format
+
+      assign pre_round_exponent = (of_before_round) ? 2**EXP_BITS-2 : final_exponent[EXP_BITS-1:0];
+      assign pre_round_mantissa = (of_before_round) ? '1 : final_mantissa[SUPER_DST_MAN_BITS-:MAN_BITS];
+      // Assemble result before rounding. In case of overflow, the largest normal value is set.
+      assign fmt_pre_round_abs[fmt] = {pre_round_exponent, pre_round_mantissa}; // 0-extend
+
+      // Round bit is after mantissa (1 in case of overflow for rounding)
+      assign fmt_round_sticky_bits[fmt][1] = final_mantissa[SUPER_DST_MAN_BITS-MAN_BITS] |
+                                             of_before_round;
+      assign pre_round_all_extra_bits = {final_mantissa[SUPER_DST_MAN_BITS-MAN_BITS:0], sum_sticky_bits};
+      assign fmt_stochastic_rounding_bits[fmt] = (of_before_round) ? '1
+                                                  : pre_round_all_extra_bits[(ALL_EXTRA_BITS-1)-:RSR_PRECISION_BITS];
+
+      // remaining bits in mantissa to sticky (1 in case of overflow for rounding)
+      if (MAN_BITS < SUPER_DST_MAN_BITS) begin : narrow_sticky
+        assign fmt_round_sticky_bits[fmt][0] = (| final_mantissa[SUPER_DST_MAN_BITS-MAN_BITS-1:0]) |
+                                               sticky_after_norm | of_before_round;
+      end else begin : normal_sticky
+        assign fmt_round_sticky_bits[fmt][0] = sticky_after_norm | of_before_round;
+      end
+    end else begin : inactive_format
+      assign fmt_pre_round_abs[fmt] = '{default: fpnew_pkg::DONT_CARE};
+      assign fmt_round_sticky_bits[fmt] = '{default: fpnew_pkg::DONT_CARE};
+      assign fmt_stochastic_rounding_bits[fmt] = '{default: fpnew_pkg::DONT_CARE};
+    end
+  end
+
+  // Assemble result before rounding. In case of overflow, the largest normal value is set.
+  assign pre_round_abs      = fmt_pre_round_abs[dst_fmt_q2];
+
+  // In case of overflow, the round and sticky bits are set for proper rounding
+  assign stochastic_rounding_bits = fmt_stochastic_rounding_bits[dst_fmt_q2];
+  assign round_sticky_bits  = fmt_round_sticky_bits[dst_fmt_q2];
+  assign pre_round_sign     = (info_max_is_zero_q && (pre_round_abs == '0) && (| round_sticky_bits))
+                              ? final_sign_zero_q : final_sign_z;
+
+  logic enable_rsr;
+  assign enable_rsr = (rnd_mode_q == fpnew_pkg::RSR) && (mid_pipe_ready[NUM_MID_REGS]
+                      && mid_pipe_valid_q[NUM_MID_REGS]);
+  // Perform the rounding
+  fpnew_rounding #(
+    .AbsWidth     ( SUPER_DST_EXP_BITS + SUPER_DST_MAN_BITS ),
+    .EnableRSR    ( ENABLE_RSR         ),
+    .RsrPrecision ( RSR_PRECISION_BITS ),
+    .LfsrWidth    ( LFSR_WIDTH         )
+  ) i_fpnew_rounding (
+    .clk_i                      ( clk_i                    ),
+    .rst_ni                     ( rst_ni                   ),
+    .id_i                       ( sdotp_hart_id_i          ),
+    .abs_value_i                ( pre_round_abs            ),
+    .en_rsr_i                   ( enable_rsr               ),
+    .sign_i                     ( pre_round_sign           ),
+    .round_sticky_bits_i        ( round_sticky_bits        ),
+    .stochastic_rounding_bits_i ( stochastic_rounding_bits ),
+    .rnd_mode_i                 ( rnd_mode_q               ),
+    .effective_subtraction_i    ( effective_subtraction_z  ),
+    .abs_rounded_o              ( rounded_abs              ),
+    .sign_o                     ( rounded_sign             ),
+    .exact_zero_o               ( result_zero              )
+  );
+
+  logic [NUM_FORMATS-1:0][DST_WIDTH-1:0] fmt_result;
+
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_sign_inject
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (DstDotpFpFmtConfig[fmt]) begin : active_dst_format
+      always_comb begin : post_process
+        // detect of / uf
+        fmt_uf_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0; // denormal
+        fmt_of_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '1; // inf exp.
+
+        // Assemble regular result, nan box short ones.
+        fmt_result[fmt]               = '1;
+        fmt_result[fmt][FP_WIDTH-1:0] = {rounded_sign, rounded_abs[EXP_BITS+MAN_BITS-1:0]};
+      end
+    end else begin : inactive_format
+      assign fmt_uf_after_round[fmt] = fpnew_pkg::DONT_CARE;
+      assign fmt_of_after_round[fmt] = fpnew_pkg::DONT_CARE;
+      assign fmt_result[fmt]         = '{default: fpnew_pkg::DONT_CARE};
+    end
+  end
+
+  // Classification after rounding select by destination format
+  assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];
+  assign of_after_round = fmt_of_after_round[dst_fmt_q2];
+
+  // -----------------
+  // Result selection
+  // -----------------
+  logic [DST_WIDTH-1:0] regular_result;
+  fpnew_pkg::status_t   regular_status;
+
+  // Assemble regular result
+  assign regular_result    = fmt_result[dst_fmt_q2];
+  assign regular_status.NV = 1'b0; // only valid cases are handled in regular path
+  assign regular_status.DZ = 1'b0; // no divisions
+  assign regular_status.OF = of_before_round | of_after_round;   // rounding can introduce overflow
+  assign regular_status.UF = uf_after_round & regular_status.NX; // only inexact results raise UF
+  assign regular_status.NX = (| round_sticky_bits) | of_before_round | of_after_round;
+
+  // Final results for output pipeline
+  logic [DST_WIDTH-1:0] result_d;
+  fpnew_pkg::status_t   status_d;
+
+  // Select output depending on special case detection
+  assign result_d = result_is_special_q ? special_result_q : regular_result;
+  assign status_d = result_is_special_q ? special_status_q : regular_status;
+
+  // ----------------
+  // Output Pipeline
+  // ----------------
+  // Output pipeline signals, index i holds signal after i register stages
+  logic               [0:NUM_OUT_REGS][DST_WIDTH-1:0] out_pipe_result_q;
+  fpnew_pkg::status_t [0:NUM_OUT_REGS]                out_pipe_status_q;
+  TagType             [0:NUM_OUT_REGS]                out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS]                out_pipe_mask_q;
+  AuxType             [0:NUM_OUT_REGS]                out_pipe_aux_q;
+  logic               [0:NUM_OUT_REGS]                out_pipe_valid_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_OUT_REGS] out_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from inputs
+  assign out_pipe_result_q[0] = result_d;
+  assign out_pipe_status_q[0] = status_d;
+  assign out_pipe_tag_q[0]    = mid_pipe_tag_q[NUM_MID_REGS];
+  assign out_pipe_mask_q[0]   = mid_pipe_mask_q[NUM_MID_REGS];
+  assign out_pipe_aux_q[0]    = mid_pipe_aux_q[NUM_MID_REGS];
+  assign out_pipe_valid_q[0]  = mid_pipe_valid_q[NUM_MID_REGS];
+  // Input stage: Propagate pipeline ready signal to inside pipe
+  assign mid_pipe_ready[NUM_MID_REGS] = out_pipe_ready[0];
+  // Generate the register stages
+  for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
+    // Internal register enable for this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign out_pipe_ready[i] = out_pipe_ready[i+1] | ~out_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = out_pipe_ready[i] & out_pipe_valid_q[i];
+    // Generate the pipeline registers within the stages, use enable-registers
+    `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
+    `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
+    `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
+    `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
+  end
+  // Output stage: Ready travels backwards from output side, driven by downstream circuitry
+  assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
+  // Output stage: assign module outputs
+  assign result_o        = out_pipe_result_q[NUM_OUT_REGS];
+  assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
+  assign extension_bit_o = 1'b1; // always NaN-Box result
+  assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
+  assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
+  assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
+  assign busy_o          = (| {inp_pipe_valid_q, mid_pipe_valid_q, out_pipe_valid_q});
+endmodule
diff --git a/src/fpnew_sdotp_multi_wrapper.sv b/src/fpnew_sdotp_multi_wrapper.sv
new file mode 100644
index 00000000..83e02681
--- /dev/null
+++ b/src/fpnew_sdotp_multi_wrapper.sv
@@ -0,0 +1,190 @@
+// Copyright 2019-2021 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Gianna Paulin <pauling@iis.ee.ethz.ch>
+// Author: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch>
+// Author: Stefan Mach <smach@iis.ee.ethz.ch>
+
+`include "common_cells/registers.svh"
+
+module fpnew_sdotp_multi_wrapper #( // adapts packed LaneWidth-wide operands to fpnew_sdotp_multi
+  parameter int unsigned             LaneWidth   = 64, // width of each operands_i entry and of result_o
+  parameter fpnew_pkg::fmt_logic_t   FpFmtConfig = '1, // format mask; src/dst subsets are derived below
+  parameter int unsigned             NumPipeRegs = 0, // forwarded to i_fpnew_sdotp_multi
+  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::BEFORE, // forwarded to i_fpnew_sdotp_multi
+  parameter type                     TagType     = logic,
+  parameter type                     AuxType     = logic,
+  parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
+  // Do not change
+  localparam fpnew_pkg::fmt_logic_t FpSrcFmtConfig = FpFmtConfig[0] ? (FpFmtConfig & 6'b001111) : (FpFmtConfig & 6'b000101),
+  localparam fpnew_pkg::fmt_logic_t FpDstFmtConfig = fpnew_pkg::get_dotp_dst_fmts(FpFmtConfig, FpSrcFmtConfig),
+  localparam int                    SRC_WIDTH      = fpnew_pkg::max_fp_width(FpSrcFmtConfig),
+  localparam int                    DST_WIDTH      = 2*fpnew_pkg::max_fp_width(FpSrcFmtConfig), // do not change, current assumption of sdotpex_multi
+  localparam int                    OPERAND_WIDTH  = LaneWidth,
+  localparam int unsigned           NUM_FORMATS    = fpnew_pkg::NUM_FP_FORMATS
+) (
+  input logic                          clk_i,
+  input logic                          rst_ni,
+  input logic [33:0]                   sdotp_hart_id_i, // forwarded unchanged to i_fpnew_sdotp_multi
+  // Input signals
+  input logic [2:0][OPERAND_WIDTH-1:0] operands_i, // [0],[1]: packed source elements; [2]: destination-format operand
+  input logic [NUM_FORMATS-1:0][2:0]   is_boxed_i, // per format, one flag per entry of operands_i
+  input fpnew_pkg::roundmode_e         rnd_mode_i,
+  input fpnew_pkg::operation_e         op_i,
+  input logic                          op_mod_i,
+  input fpnew_pkg::fp_format_e         src_fmt_i,
+  input fpnew_pkg::fp_format_e         dst_fmt_i,
+  input TagType                        tag_i,
+  input logic                          mask_i,
+  input AuxType                        aux_i,
+  // Input Handshake
+  input  logic                         in_valid_i,
+  output logic                         in_ready_o,
+  input  logic                         flush_i,
+  // Output signals
+  output logic [OPERAND_WIDTH-1:0]     result_o,
+  output fpnew_pkg::status_t           status_o,
+  output logic                         extension_bit_o,
+  output TagType                       tag_o,
+  output logic                         mask_o,
+  output AuxType                       aux_o,
+  // Output handshake
+  output logic                         out_valid_o,
+  input  logic                         out_ready_i,
+  // Indication of valid data in flight
+  output logic                         busy_o
+);
+
+  // ----------
+  // Constants
+  // ----------
+  localparam int unsigned N_SRC_FMT_OPERANDS = 4; // operands a, b, c, d
+  localparam int unsigned N_DST_FMT_OPERANDS = 1; // the single dst_operands_i input
+
+  // -----------------
+  // Input processing
+  // -----------------
+  logic                             [NUM_FORMATS-1:0][DST_WIDTH-1:0] local_src_fmt_operand_a;  // per-format operand a (DST_WIDTH wide)
+  logic                             [NUM_FORMATS-1:0][SRC_WIDTH-1:0] local_src_fmt_operand_b;  // per-format operand b (SRC_WIDTH wide)
+  logic                             [NUM_FORMATS-1:0][DST_WIDTH-1:0] local_src_fmt_operand_c;  // per-format operand c (DST_WIDTH wide)
+  logic                             [NUM_FORMATS-1:0][SRC_WIDTH-1:0] local_src_fmt_operand_d;  // per-format operand d (SRC_WIDTH wide)
+  logic                                              [DST_WIDTH-1:0] local_dst_fmt_operands;  // low DST_WIDTH bits of operands_i[2]
+  logic [NUM_FORMATS-1:0][N_SRC_FMT_OPERANDS+N_DST_FMT_OPERANDS-1:0] local_is_boxed;  // per format: [3:0] parallel tmp_operands[3:0], [4] dst operand
+  logic                                          [OPERAND_WIDTH-1:0] local_result;  // result_o before output; low DST_WIDTH bits come from the core
+
+
+  // ----------------------------------
+  // assign operands with dst format
+  // ----------------------------------
+  assign local_dst_fmt_operands = operands_i[2][DST_WIDTH-1:0];
+
+
+  // ----------------------------------
+  // assign operands with src format
+  // ----------------------------------
+  // Per format: slice the packed elements out of operands_i and fill the unused upper bits with ones
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_nanbox
+
+    localparam int unsigned FP_WIDTH         = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned FP_WIDTH_MIN     = fpnew_pkg::minimum(SRC_WIDTH, FP_WIDTH); // bits copied into SRC_WIDTH-wide slots
+    localparam int unsigned FP_WIDTH_DST_MIN = fpnew_pkg::minimum(DST_WIDTH, FP_WIDTH); // bits copied into DST_WIDTH-wide slots
+
+    logic [N_SRC_FMT_OPERANDS-1:0][FP_WIDTH_DST_MIN-1:0] tmp_operands;     // [0],[2]: elements 0,1 of operands_i[0]; [1],[3]: elements 0,1 of operands_i[1]
+
+    always_comb begin : nanbox
+      // shift element 0 / element 1 of each source operand down to bit 0 (the assignment truncates)
+      tmp_operands[0] = operands_i[0] >> 0*FP_WIDTH;
+      tmp_operands[1] = operands_i[1] >> 0*FP_WIDTH;
+      tmp_operands[2] = operands_i[0] >> 1*FP_WIDTH;
+      tmp_operands[3] = operands_i[1] >> 1*FP_WIDTH;
+      // default every operand to all ones; only the low bits are overwritten below
+      local_src_fmt_operand_a[fmt] = '1;
+      local_src_fmt_operand_b[fmt] = '1;
+      local_src_fmt_operand_c[fmt] = '1;
+      local_src_fmt_operand_d[fmt] = '1;
+      if (op_i == fpnew_pkg::VSUM) begin // VSUM: a and c take up to DST_WIDTH bits, b and d stay all ones
+        local_src_fmt_operand_a[fmt][FP_WIDTH_DST_MIN-1:0] = tmp_operands[0][FP_WIDTH_DST_MIN-1:0];
+        local_src_fmt_operand_b[fmt][FP_WIDTH_MIN-1:0]     = '1;
+        if(FP_WIDTH == LaneWidth) begin // format fills the whole lane: c is taken from operands_i[1]
+          local_src_fmt_operand_c[fmt][FP_WIDTH_DST_MIN-1:0] = tmp_operands[1][FP_WIDTH_DST_MIN-1:0];
+        end else begin // otherwise c is element 1 of operands_i[0]
+          local_src_fmt_operand_c[fmt][FP_WIDTH_DST_MIN-1:0] = tmp_operands[2][FP_WIDTH_DST_MIN-1:0];
+        end
+        local_src_fmt_operand_d[fmt][FP_WIDTH_MIN-1:0]     = '1;
+      end else begin // all other ops: a/c = elements 0/1 of operands_i[0], b/d = elements 0/1 of operands_i[1]
+        local_src_fmt_operand_a[fmt][FP_WIDTH_MIN-1:0] = tmp_operands[0][FP_WIDTH_MIN-1:0];
+        local_src_fmt_operand_b[fmt][FP_WIDTH_MIN-1:0] = tmp_operands[1][FP_WIDTH_MIN-1:0];
+        local_src_fmt_operand_c[fmt][FP_WIDTH_MIN-1:0] = tmp_operands[2][FP_WIDTH_MIN-1:0];
+        local_src_fmt_operand_d[fmt][FP_WIDTH_MIN-1:0] = tmp_operands[3][FP_WIDTH_MIN-1:0];
+      end
+      // take is_boxed info from external or set to 1 if boxed for dotp operation
+      local_is_boxed[fmt][0] = is_boxed_i[fmt][0];
+      local_is_boxed[fmt][1] = is_boxed_i[fmt][1];
+      local_is_boxed[fmt][2] = is_boxed_i[fmt][0];
+      local_is_boxed[fmt][3] = is_boxed_i[fmt][1];
+      if(FP_WIDTH <= SRC_WIDTH) begin // formats no wider than SRC_WIDTH: the four source flags are forced to 1
+        local_is_boxed[fmt][0] = '1;
+        local_is_boxed[fmt][1] = '1;
+        local_is_boxed[fmt][2] = '1;
+        local_is_boxed[fmt][3] = '1;
+      end
+      local_is_boxed[fmt][4] = is_boxed_i[dst_fmt_i][2]; // third-operand flag, looked up at dst_fmt_i (same value in every fmt row)
+    end
+  end
+
+  fpnew_sdotp_multi #(
+    .SrcDotpFpFmtConfig ( FpSrcFmtConfig ), // FP8, FP8ALT, FP16, FP16ALT
+    .DstDotpFpFmtConfig ( FpDstFmtConfig ), // FP32, FP16, FP16ALT
+    .NumPipeRegs        ( NumPipeRegs    ),
+    .PipeConfig         ( PipeConfig     ),
+    .TagType            ( TagType        ),
+    .AuxType            ( AuxType        ),
+    .StochasticRndImplementation ( StochasticRndImplementation )
+  ) i_fpnew_sdotp_multi (
+    .clk_i,
+    .rst_ni,
+    .sdotp_hart_id_i,
+    .operand_a_i     ( local_src_fmt_operand_a[src_fmt_i] ),
+    .operand_b_i     ( local_src_fmt_operand_b[src_fmt_i] ),
+    .operand_c_i     ( local_src_fmt_operand_c[src_fmt_i] ),
+    .operand_d_i     ( local_src_fmt_operand_d[src_fmt_i] ),
+    .dst_operands_i  ( local_dst_fmt_operands             ), // 1 operand
+    .is_boxed_i      ( local_is_boxed                     ),
+    .rnd_mode_i,
+    .op_i,
+    .op_mod_i,
+    .src_fmt_i, // format of the multiplicands
+    .dst_fmt_i, // format of the addend and result
+    .tag_i,
+    .mask_i,
+    .aux_i,
+    .in_valid_i,
+    .in_ready_o ,
+    .flush_i,
+    .result_o        ( local_result[DST_WIDTH-1:0] ), // the core drives only the low DST_WIDTH bits
+    .status_o,
+    .extension_bit_o,
+    .tag_o,
+    .mask_o,
+    .aux_o,
+    .out_valid_o,
+    .out_ready_i,
+    .busy_o
+  );
+  // Result bits above DST_WIDTH (present only when the lane is wider) are tied to all ones
+  if(OPERAND_WIDTH > DST_WIDTH) begin
+   assign local_result[OPERAND_WIDTH-1:DST_WIDTH]  = '1;
+  end
+  assign result_o                              = local_result;
+
+endmodule
diff --git a/src/fpnew_top.sv b/src/fpnew_top.sv
index a6ff89a6..2c4e2b3b 100644
--- a/src/fpnew_top.sv
+++ b/src/fpnew_top.sv
@@ -22,6 +22,8 @@ module fpnew_top #(
   parameter type                            TagType        = logic,
   parameter int unsigned                    TrueSIMDClass  = 0,
   parameter int unsigned                    EnableSIMDMask = 0,
+  parameter logic                           CompressedVecCmpResult = 0, // conceived for RV32FD cores
+  parameter fpnew_pkg::rsr_impl_t           StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
   localparam int unsigned NumLanes     = fpnew_pkg::max_num_lanes(Features.Width, Features.FpFmtMask, Features.EnableVectors),
   localparam type         MaskType     = logic [NumLanes-1:0],
@@ -30,6 +32,7 @@ module fpnew_top #(
 ) (
   input logic                               clk_i,
   input logic                               rst_ni,
+  input logic [31:0]                        hart_id_i,
   // Input signals
   input logic [NUM_OPERANDS-1:0][WIDTH-1:0] operands_i,
   input fpnew_pkg::roundmode_e              rnd_mode_i,
@@ -126,10 +129,13 @@ module fpnew_top #(
       .FmtUnitTypes  ( Implementation.UnitTypes[opgrp] ),
       .PipeConfig    ( Implementation.PipeConfig       ),
       .TagType       ( TagType                         ),
-      .TrueSIMDClass ( TrueSIMDClass                   )
+      .TrueSIMDClass ( TrueSIMDClass                   ),
+      .CompressedVecCmpResult ( CompressedVecCmpResult ),
+      .StochasticRndImplementation ( StochasticRndImplementation )
     ) i_opgroup_block (
       .clk_i,
       .rst_ni,
+      .hart_id_i,
       .operands_i      ( operands_i[NUM_OPS-1:0] ),
       .is_boxed_i      ( input_boxed             ),
       .rnd_mode_i,
diff --git a/src/lfsr_sr.sv b/src/lfsr_sr.sv
new file mode 100644
index 00000000..9366a2e8
--- /dev/null
+++ b/src/lfsr_sr.sv
@@ -0,0 +1,352 @@
+// Copyright 2019 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License.  You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
+// Date: 26.04.2019
+//
+// Description: This is a parametric Galois LFSR with precomputed feedback masks
+// for LFSR lengths from 4 to 64 bit. After reset, the state is seeded from id_i.
+
+// Additional block cipher layers can be instantiated to non-linearly transform
+// the pseudo-random LFSR sequence at the output, and hence break the shifting
+// patterns. The additional cipher layers can only be used for an LFSR width
+// of 64bit, since the block cipher has been designed for that block length.
+
+module lfsr_sr #(
+  parameter int unsigned          LfsrWidth     = 64,   // [4,64]
+  parameter int unsigned          OutWidth      = 8,    // [1,LfsrWidth]
+  parameter logic [LfsrWidth-1:0] RstVal        = '1,   // [1,2^LfsrWidth-1], LFSR state loaded on reset
+  // 0: disabled, the present cipher uses 31, but just a few layers (1-3) are enough
+  // to break linear shifting patterns
+  parameter int unsigned          CipherLayers  = 0,
+  parameter bit                   CipherReg     = 1'b1  // additional output reg after cipher
+) (
+  input  logic                 clk_i,   // clock, all registers update on the rising edge
+  input  logic                 rst_ni,  // asynchronous reset, active low
+  input  logic [33:0]          id_i,    // seed source: its inverted value is loaded into the LFSR once after reset
+  input  logic                 en_i,    // advances the LFSR by one step per cycle while high
+  output logic [OutWidth-1:0]  out_o    // low OutWidth bits of the LFSR state or of the cipher output
+);
+
+// Galois LFSR feedback masks, indexed by LFSR width: Masks[W] is used for a W-bit LFSR
+// Automatically generated with get_lfsr_masks.py
+// Masks are from https://users.ece.cmu.edu/~koopman/lfsr/
+localparam logic [63:0] Masks [4:64] = '{64'hC,
+                                         64'h1E,
+                                         64'h39,
+                                         64'h7E,
+                                         64'hFA,
+                                         64'h1FD,
+                                         64'h3FC,
+                                         64'h64B,
+                                         64'hD8F,
+                                         64'h1296,
+                                         64'h2496,
+                                         64'h4357,
+                                         64'h8679,
+                                         64'h1030E,
+                                         64'h206CD,
+                                         64'h403FE,
+                                         64'h807B8,
+                                         64'h1004B2,
+                                         64'h2006A8,
+                                         64'h4004B2,
+                                         64'h800B87,
+                                         64'h10004F3,
+                                         64'h200072D,
+                                         64'h40006AE,
+                                         64'h80009E3,
+                                         64'h10000583,
+                                         64'h20000C92,
+                                         64'h400005B6,
+                                         64'h80000EA6,
+                                         64'h1000007A3,
+                                         64'h200000ABF,
+                                         64'h400000842,
+                                         64'h80000123E,
+                                         64'h100000074E,
+                                         64'h2000000AE9,
+                                         64'h400000086A,
+                                         64'h8000001213,
+                                         64'h1000000077E,
+                                         64'h2000000123B,
+                                         64'h40000000877,
+                                         64'h8000000108D,
+                                         64'h100000000AE9,
+                                         64'h200000000E9F,
+                                         64'h4000000008A6,
+                                         64'h80000000191E,
+                                         64'h100000000090E,
+                                         64'h2000000000FB3,
+                                         64'h4000000000D7D,
+                                         64'h80000000016A5,
+                                         64'h10000000000B4B,
+                                         64'h200000000010AF,
+                                         64'h40000000000DDE,
+                                         64'h8000000000181A,
+                                         64'h100000000000B65,
+                                         64'h20000000000102D,
+                                         64'h400000000000CD5,
+                                         64'h8000000000024C1,
+                                         64'h1000000000000EF6,
+                                         64'h2000000000001363,
+                                         64'h4000000000000FCD,
+                                         64'h80000000000019E2};
+
+// The S-box and the permutation P have been taken from the PRESENT cipher,
+// a super lightweight block cipher. Use the cipher layers to add additional
+// non-linearity to the LFSR output. Note that one layer does not fully correspond
+// to a PRESENT cipher round, since the key and rekeying function are not applied here.
+//
+// See also:
+// "PRESENT: An Ultra-Lightweight Block Cipher", A. Bogdanov et al., Ches 2007
+// http://www.lightweightcrypto.org/present/present_ches2007.pdf
+
+// S-box of the PRESENT cipher as a packed lookup table, listed from index 15 down to index 0
+localparam logic[15:0][3:0] Sbox4 = {4'h2, 4'h1, 4'h7, 4'h4,
+                                     4'h8, 4'hF, 4'hE, 4'h3,
+                                     4'hD, 4'hA, 4'h0, 4'h9,
+                                     4'hB, 4'h6, 4'h5, 4'hC }; // last entry is Sbox4[0]
+
+// Permutation indices of the PRESENT cipher, listed from Perm[63] down to Perm[0]; perm_layer routes in[j] to out[Perm[j]]
+localparam logic[63:0][5:0] Perm = {6'd63, 6'd47, 6'd31, 6'd15, 6'd62, 6'd46, 6'd30, 6'd14,
+                                    6'd61, 6'd45, 6'd29, 6'd13, 6'd60, 6'd44, 6'd28, 6'd12,
+                                    6'd59, 6'd43, 6'd27, 6'd11, 6'd58, 6'd42, 6'd26, 6'd10,
+                                    6'd57, 6'd41, 6'd25, 6'd09, 6'd56, 6'd40, 6'd24, 6'd08,
+                                    6'd55, 6'd39, 6'd23, 6'd07, 6'd54, 6'd38, 6'd22, 6'd06,
+                                    6'd53, 6'd37, 6'd21, 6'd05, 6'd52, 6'd36, 6'd20, 6'd04,
+                                    6'd51, 6'd35, 6'd19, 6'd03, 6'd50, 6'd34, 6'd18, 6'd02,
+                                    6'd49, 6'd33, 6'd17, 6'd01, 6'd48, 6'd32, 6'd16, 6'd00}; // last entry is Perm[0]
+
+
+function automatic logic [63:0] sbox4_layer(logic [63:0] in);
+  // Substitution layer: every 4-bit nibble of the 64-bit input is replaced by
+  // its Sbox4 table entry. Nibble k of the result depends only on nibble k of
+  // the input, so nibble positions are preserved.
+  // The 16 lookups are spelled out (most significant nibble first) instead of
+  // being produced by a loop, since the unrolled form simulates much faster.
+  return {
+    Sbox4[in[15*4 +: 4]],
+    Sbox4[in[14*4 +: 4]],
+    Sbox4[in[13*4 +: 4]],
+    Sbox4[in[12*4 +: 4]],
+    Sbox4[in[11*4 +: 4]],
+    Sbox4[in[10*4 +: 4]],
+    Sbox4[in[9*4  +: 4]],
+    Sbox4[in[8*4  +: 4]],
+    Sbox4[in[7*4  +: 4]],
+    Sbox4[in[6*4  +: 4]],
+    Sbox4[in[5*4  +: 4]],
+    Sbox4[in[4*4  +: 4]],
+    Sbox4[in[3*4  +: 4]],
+    Sbox4[in[2*4  +: 4]],
+    Sbox4[in[1*4  +: 4]],
+    Sbox4[in[0*4  +: 4]]
+  };
+endfunction : sbox4_layer
+
+function automatic logic [63:0] perm_layer(logic [63:0] in); // bit permutation: in[j] -> out[Perm[j]]
+  logic [63:0] out;
+  // equivalent to: for (int j = 0; j < 64; j++) out[Perm[j]] = in[j];
+  // written out bit by bit because this simulates much faster than the loop
+  out[Perm[0]] = in[0];
+  out[Perm[1]] = in[1];
+  out[Perm[2]] = in[2];
+  out[Perm[3]] = in[3];
+  out[Perm[4]] = in[4];
+  out[Perm[5]] = in[5];
+  out[Perm[6]] = in[6];
+  out[Perm[7]] = in[7];
+  out[Perm[8]] = in[8];
+  out[Perm[9]] = in[9];
+
+  out[Perm[10]] = in[10];
+  out[Perm[11]] = in[11];
+  out[Perm[12]] = in[12];
+  out[Perm[13]] = in[13];
+  out[Perm[14]] = in[14];
+  out[Perm[15]] = in[15];
+  out[Perm[16]] = in[16];
+  out[Perm[17]] = in[17];
+  out[Perm[18]] = in[18];
+  out[Perm[19]] = in[19];
+
+  out[Perm[20]] = in[20];
+  out[Perm[21]] = in[21];
+  out[Perm[22]] = in[22];
+  out[Perm[23]] = in[23];
+  out[Perm[24]] = in[24];
+  out[Perm[25]] = in[25];
+  out[Perm[26]] = in[26];
+  out[Perm[27]] = in[27];
+  out[Perm[28]] = in[28];
+  out[Perm[29]] = in[29];
+
+  out[Perm[30]] = in[30];
+  out[Perm[31]] = in[31];
+  out[Perm[32]] = in[32];
+  out[Perm[33]] = in[33];
+  out[Perm[34]] = in[34];
+  out[Perm[35]] = in[35];
+  out[Perm[36]] = in[36];
+  out[Perm[37]] = in[37];
+  out[Perm[38]] = in[38];
+  out[Perm[39]] = in[39];
+
+  out[Perm[40]] = in[40];
+  out[Perm[41]] = in[41];
+  out[Perm[42]] = in[42];
+  out[Perm[43]] = in[43];
+  out[Perm[44]] = in[44];
+  out[Perm[45]] = in[45];
+  out[Perm[46]] = in[46];
+  out[Perm[47]] = in[47];
+  out[Perm[48]] = in[48];
+  out[Perm[49]] = in[49];
+
+  out[Perm[50]] = in[50];
+  out[Perm[51]] = in[51];
+  out[Perm[52]] = in[52];
+  out[Perm[53]] = in[53];
+  out[Perm[54]] = in[54];
+  out[Perm[55]] = in[55];
+  out[Perm[56]] = in[56];
+  out[Perm[57]] = in[57];
+  out[Perm[58]] = in[58];
+  out[Perm[59]] = in[59];
+
+  out[Perm[60]] = in[60];
+  out[Perm[61]] = in[61];
+  out[Perm[62]] = in[62];
+  out[Perm[63]] = in[63];
+  return out;
+endfunction : perm_layer
+
+////////////////////////////////////////////////////////////////////////
+// lfsr
+////////////////////////////////////////////////////////////////////////
+
+// LFSR state register: lfsr_q is the current state, lfsr_d the next state.
+logic [LfsrWidth-1:0] lfsr_d, lfsr_q;
+
+// The asynchronous active-low reset loads RstVal; otherwise lfsr_d is
+// taken over on every rising clock edge.
+always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
+  if (~rst_ni) lfsr_q <= LfsrWidth'(RstVal);
+  else         lfsr_q <= lfsr_d;
+end
+
+// Seeding FSM: after reset the LFSR holds RstVal (FSM_IDLE), is then loaded
+// with the id-derived seed (FSM_INITIALIZE) and finally runs (FSM_COMPUTE).
+typedef enum logic [1:0] {
+  FSM_IDLE,
+  FSM_INITIALIZE,
+  FSM_COMPUTE
+} state_fsm_t;
+
+state_fsm_t curr_state, next_state;
+
+// FSM state register with asynchronous active-low reset into FSM_IDLE
+always_ff @(posedge clk_i or negedge rst_ni) begin : main_fsm_seq
+  if (!rst_ni) curr_state <= FSM_IDLE;
+  else         curr_state <= next_state;
+end
+
+always_comb begin
+  next_state = curr_state;
+  lfsr_d     = lfsr_q; // default: hold the current LFSR state
+  case (curr_state)
+    FSM_IDLE: next_state = FSM_INITIALIZE; // lfsr_q still holds its reset value
+    FSM_INITIALIZE: begin
+      // Size-cast instead of id_i[LfsrWidth-1:0]: id_i is only 34 bit wide, so that
+      // part-select is out of range for LfsrWidth > 34 (including the default of 64).
+      lfsr_d = ~LfsrWidth'(id_i);
+      // An all-zero state is a fixed point of the Galois update below, so fall back
+      // to the reset value if the inverted id happens to be all-zero.
+      if (lfsr_d == '0) lfsr_d = LfsrWidth'(RstVal);
+      next_state = FSM_COMPUTE;
+    end
+    FSM_COMPUTE: begin
+      // Galois step: shift right and XOR in the feedback mask if the bit shifted out is 1
+      if (en_i) lfsr_d = (lfsr_q>>1) ^ ({LfsrWidth{lfsr_q[0]}} & Masks[LfsrWidth][LfsrWidth-1:0]);
+      next_state = FSM_COMPUTE;
+    end
+    default : next_state = FSM_COMPUTE;
+  endcase
+end
+
+////////////////////////////////////////////////////////////////////////
+// block cipher layers
+////////////////////////////////////////////////////////////////////////
+
+// Optional non-linear output stage: CipherLayers rounds of S-box substitution
+// followed by bit permutation are applied to the LFSR state.
+if (CipherLayers > unsigned'(0)) begin : g_cipher_layers
+  // number of LFSR copies concatenated before truncating to the 64-bit cipher width
+  localparam int unsigned NumRepl = ((64+LfsrWidth)/LfsrWidth);
+  logic [63:0] ciph_layer;
+
+  always_comb begin : p_ciph_layer
+    automatic logic [63:0] stage;
+    stage = 64'({NumRepl{lfsr_q}});
+    for (int unsigned layer = 0; layer < CipherLayers; layer++) begin
+      stage = sbox4_layer(stage);
+      stage = perm_layer(stage);
+    end
+    ciph_layer = stage;
+  end
+
+  if (CipherReg) begin : g_cipher_reg
+    // additional output register after the cipher: it takes a new cipher value
+    // only while en_i is high and is cleared by the asynchronous reset
+    logic [OutWidth-1:0] out_d, out_q;
+    assign out_d = en_i ? ciph_layer[OutWidth-1:0] : out_q;
+    assign out_o = out_q;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
+      if (~rst_ni) out_q <= '0;
+      else         out_q <= out_d;
+    end
+  end else begin : g_no_out_reg
+    // no output register: the cipher result drives out_o combinationally
+    assign out_o = ciph_layer[OutWidth-1:0];
+  end
+end else begin : g_no_cipher_layers
+  // no block cipher: out_o is the low OutWidth bits of the LFSR state
+  assign out_o = lfsr_q[OutWidth-1:0];
+end
+
+////////////////////////////////////////////////////////////////////////
+// assertions
+////////////////////////////////////////////////////////////////////////
+
+// pragma translate_off
+initial begin
+  // parameter sanity checks, evaluated once at simulation start; any failure is fatal
+  assert(OutWidth <= LfsrWidth) else
+    $fatal(1,"OutWidth must be smaller equal the LfsrWidth.");
+  assert(RstVal > unsigned'(0)) else
+    $fatal(1,"RstVal must be nonzero.");
+  assert((LfsrWidth >= $low(Masks)) && (LfsrWidth <= $high(Masks))) else
+    $fatal(1,"Unsupported LfsrWidth.");
+  assert(Masks[LfsrWidth][LfsrWidth-1]) else
+    $fatal(1, "LFSR mask is not correct. The MSB must be 1." );
+  assert((CipherLayers > 0) && (LfsrWidth == 64) || (CipherLayers == 0)) else
+    $fatal(1, "Use additional cipher layers only in conjunction with an LFSR width of 64 bit." );
+end
+
+`ifndef VERILATOR
+  all_zero: assert property ( // whenever en_i is high, the next LFSR state lfsr_d must be non-zero
+    @(posedge clk_i) disable iff (!rst_ni) en_i |-> lfsr_d)
+      else $fatal(1,"Lfsr must not be all-zero.");
+`endif
+// pragma translate_on
+
+endmodule // lfsr_sr
diff --git a/src_files.yml b/src_files.yml
index 6f8aedd5..eaf51dd0 100644
--- a/src_files.yml
+++ b/src_files.yml
@@ -25,6 +25,8 @@ fpnew:
     src/fpnew_divsqrt_multi.sv,
     src/fpnew_fma.sv,
     src/fpnew_fma_multi.sv,
+    src/fpnew_sdotp_multi.sv,
+    src/fpnew_sdotp_multi_wrapper.sv,
     src/fpnew_noncomp.sv,
     src/fpnew_opgroup_block.sv,
     src/fpnew_opgroup_fmt_slice.sv,