diff --git a/src/fpnew_divsqrt_multi.sv b/src/fpnew_divsqrt_multi.sv index ac23c43e..44a030a1 100644 --- a/src/fpnew_divsqrt_multi.sv +++ b/src/fpnew_divsqrt_multi.sv @@ -41,10 +41,6 @@ module fpnew_divsqrt_multi #( // Input Handshake input logic in_valid_i, output logic in_ready_o, - output logic divsqrt_done_o, - input logic simd_synch_done_i, - output logic divsqrt_ready_o, - input logic simd_synch_rdy_i, input logic flush_i, // Output signals output logic [WIDTH-1:0] result_o, @@ -170,11 +166,10 @@ module fpnew_divsqrt_multi #( logic in_ready; // input handshake with upstream logic div_valid, sqrt_valid; // input signalling with unit - logic unit_ready, unit_done, unit_done_q; // status signals from unit instance + logic unit_ready, unit_done; // status signals from unit instance logic op_starting; // high in the cycle a new operation starts logic out_valid, out_ready; // output handshake with downstream logic unit_busy; // valid data in flight - logic simd_synch_done; // FSM states typedef enum logic [1:0] {IDLE, BUSY, HOLD} fsm_state_e; fsm_state_e state_q, state_d; @@ -198,21 +193,8 @@ module fpnew_divsqrt_multi #( `FFL(result_aux_q, inp_pipe_aux_q[NUM_INP_REGS], op_starting, '0) `FFL(result_vec_op_q, inp_pipe_vec_op_q[NUM_INP_REGS], op_starting, '0) - // Wait for other lanes only if the operation is vectorial - assign simd_synch_done = simd_synch_done_i || ~result_vec_op_q; - - // Valid synch with other lanes - // When one divsqrt unit completes an operation, keep its done high, waiting for the other lanes - // As soon as all the lanes are over, we can clear this FF and start with a new operation - `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done, 1'b0, clk_i, rst_ni); - // Tell the other units that this unit has finished now or in the past - assign divsqrt_done_o = (unit_done_q | unit_done) & result_vec_op_q; - - // Ready synch with other lanes - // Bring the FSM-generated ready outside the unit, to synchronize it with the other lanes - assign divsqrt_ready_o = in_ready; - // Upstream ready comes from sanitization FSM, and it is synched among all the lanes - assign inp_pipe_ready[NUM_INP_REGS] = result_vec_op_q ? simd_synch_rdy_i : in_ready; + // Upstream ready comes from FSM + assign inp_pipe_ready[NUM_INP_REGS] = in_ready; // FSM to safely apply and receive data from DIVSQRT unit always_comb begin : flag_fsm @@ -234,7 +216,7 @@ module fpnew_divsqrt_multi #( BUSY: begin unit_busy = 1'b1; // data in flight // If all the lanes are done with processing - if (simd_synch_done_i || (~result_vec_op_q && unit_done)) begin + if (unit_done) begin out_valid = 1'b1; // try to commit result downstream // If downstream accepts our result if (out_ready) begin @@ -305,22 +287,6 @@ module fpnew_divsqrt_multi #( // Adjust result width and fix FP8 assign adjusted_result = result_is_fp8_q ? unit_result >> 8 : unit_result; - // Hold the result when one lane has finished execution, except when all the lanes finish together, - // or the operation is not vectorial, and the result can be accepted downstream - assign hold_en = unit_done & (~simd_synch_done_i | ~out_ready) & ~(~result_vec_op_q & out_ready); - // The Hold register (load, no reset) - `FFLNR(held_result_q, adjusted_result, hold_en, clk_i) - `FFLNR(held_status_q, unit_status, hold_en, clk_i) - - // -------------- - // Output Select - // -------------- - logic [WIDTH-1:0] result_d; - fpnew_pkg::status_t status_d; - // Prioritize hold register data - assign result_d = unit_done_q ? held_result_q : adjusted_result; - assign status_d = unit_done_q ? held_status_q : unit_status; - // ---------------- // Output Pipeline // ---------------- @@ -335,8 +301,8 @@ module fpnew_divsqrt_multi #( logic [0:NUM_OUT_REGS] out_pipe_ready; // Input stage: First element of pipeline is taken from inputs - assign out_pipe_result_q[0] = result_d; - assign out_pipe_status_q[0] = status_d; + assign out_pipe_result_q[0] = adjusted_result; + assign out_pipe_status_q[0] = unit_status; assign out_pipe_tag_q[0] = result_tag_q; assign out_pipe_mask_q[0] = result_mask_q; assign out_pipe_aux_q[0] = result_aux_q; diff --git a/src/fpnew_divsqrt_th_64_multi.sv b/src/fpnew_divsqrt_th_64_multi.sv index eff0620d..df781554 100644 --- a/src/fpnew_divsqrt_th_64_multi.sv +++ b/src/fpnew_divsqrt_th_64_multi.sv @@ -39,14 +39,9 @@ module fpnew_divsqrt_th_64_multi #( input TagType tag_i, input logic mask_i, input AuxType aux_i, - input logic vectorial_op_i, // Input Handshake input logic in_valid_i, output logic in_ready_o, - output logic divsqrt_done_o, - input logic simd_synch_done_i, - output logic divsqrt_ready_o, - input logic simd_synch_rdy_i, input logic flush_i, // Output signals output logic [WIDTH-1:0] result_o, @@ -95,7 +90,6 @@ module fpnew_divsqrt_th_64_multi #( TagType [0:NUM_INP_REGS] inp_pipe_tag_q; logic [0:NUM_INP_REGS] inp_pipe_mask_q; AuxType [0:NUM_INP_REGS] inp_pipe_aux_q; - logic [0:NUM_INP_REGS] inp_pipe_vec_op_q; logic [0:NUM_INP_REGS] inp_pipe_valid_q; // Ready signal is combinatorial for all stages logic [0:NUM_INP_REGS] inp_pipe_ready; @@ -108,7 +102,6 @@ module fpnew_divsqrt_th_64_multi #( assign inp_pipe_tag_q[0] = tag_i; assign inp_pipe_mask_q[0] = mask_i; assign inp_pipe_aux_q[0] = aux_i; - assign inp_pipe_vec_op_q[0] = vectorial_op_i; assign inp_pipe_valid_q[0] = in_valid_i; // Input stage: Propagate pipeline ready signal to upstream circuitry assign in_ready_o = inp_pipe_ready[0]; @@ -132,7 +125,6 @@ module fpnew_divsqrt_th_64_multi #( `FFL(inp_pipe_tag_q[i+1], inp_pipe_tag_q[i], reg_ena, TagType'('0)) `FFL(inp_pipe_mask_q[i+1], inp_pipe_mask_q[i], reg_ena, '0) `FFL(inp_pipe_aux_q[i+1], inp_pipe_aux_q[i], reg_ena, AuxType'('0)) - `FFL(inp_pipe_vec_op_q[i+1], inp_pipe_vec_op_q[i], reg_ena, AuxType'('0)) end // Output stage: assign selected pipe outputs to signals for later use assign operands_q = inp_pipe_operands_q[NUM_INP_REGS]; @@ -181,11 +173,11 @@ module fpnew_divsqrt_th_64_multi #( logic in_ready; // input handshake with upstream logic div_valid, sqrt_valid; // input signalling with unit - logic unit_ready, unit_done, unit_done_q; // status signals from unit instance + logic unit_ready, unit_done; // status signals from unit instance logic op_starting; // high in the cycle a new operation starts logic out_valid, out_ready; // output handshake with downstream logic unit_busy; // valid data in flight - logic simd_synch_done; + // FSM states typedef enum logic [1:0] {IDLE, BUSY, HOLD} fsm_state_e; fsm_state_e state_q, state_d; @@ -200,29 +192,13 @@ module fpnew_divsqrt_th_64_multi #( TagType result_tag_q; logic result_mask_q; AuxType result_aux_q; - logic result_vec_op_q; // Fill the registers everytime a valid operation arrives (load FF, active low asynch rst) `FFL(result_tag_q, inp_pipe_tag_q[NUM_INP_REGS], op_starting, '0) `FFL(result_mask_q, inp_pipe_mask_q[NUM_INP_REGS],op_starting, '0) `FFL(result_aux_q, inp_pipe_aux_q[NUM_INP_REGS], op_starting, '0) - `FFL(result_vec_op_q, inp_pipe_vec_op_q[NUM_INP_REGS], op_starting, '0) - - // Wait for other lanes only if the operation is vectorial - assign simd_synch_done = simd_synch_done_i || ~result_vec_op_q; - - // Valid synch with other lanes - // When one divsqrt unit completes an operation, keep its done high, waiting for the other lanes - // As soon as all the lanes are over, we can clear this FF and start with a new operation - `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done, 1'b0, clk_i, rst_ni); - // Tell the other units that this unit has finished now or in the past - assign divsqrt_done_o = (unit_done_q | unit_done) & result_vec_op_q; - // Ready synch with other lanes - // Bring the FSM-generated ready outside the unit, to synchronize it with the other lanes - assign divsqrt_ready_o = in_ready; - // Upstream ready comes from sanitization FSM, and it is synched among all the lanes - assign inp_pipe_ready[NUM_INP_REGS] = result_vec_op_q ? simd_synch_rdy_i : in_ready; + assign inp_pipe_ready[NUM_INP_REGS] = in_ready; // FSM to safely apply and receive data from DIVSQRT unit always_comb begin : flag_fsm @@ -244,7 +220,7 @@ module fpnew_divsqrt_th_64_multi #( BUSY: begin unit_busy = 1'b1; // data in flight // If all the lanes are done with processing - if (simd_synch_done_i || (~result_vec_op_q && unit_done)) begin + if (unit_done) begin out_valid = 1'b1; // try to commit result downstream // If downstream accepts our result if (out_ready) begin @@ -410,22 +386,6 @@ module fpnew_divsqrt_th_64_multi #( assign unit_ready = !vfdsu_dp_fdiv_busy; - // Hold the result when one lane has finished execution, except when all the lanes finish together, - // or the operation is not vectorial, and the result can be accepted downstream - assign hold_en = unit_done & (~simd_synch_done_i | ~out_ready) & ~(~result_vec_op_q & out_ready); - // The Hold register (load, no reset) - `FFLNR(held_result_q, unit_result, hold_en, clk_i) - `FFLNR(held_status_q, unit_status, hold_en, clk_i) - - // -------------- - // Output Select - // -------------- - logic [WIDTH-1:0] result_d; - fpnew_pkg::status_t status_d; - // Prioritize hold register data - assign result_d[WIDTH-1:0] = unit_done_q ? held_result_q[WIDTH-1:0] : unit_result[WIDTH-1:0]; - assign status_d = unit_done_q ? held_status_q : unit_status; - // ---------------- // Output Pipeline // ---------------- @@ -440,8 +400,8 @@ module fpnew_divsqrt_th_64_multi #( logic [0:NUM_OUT_REGS] out_pipe_ready; // Input stage: First element of pipeline is taken from inputs - assign out_pipe_result_q[0] = result_d; - assign out_pipe_status_q[0] = status_d; + assign out_pipe_result_q[0] = unit_result; + assign out_pipe_status_q[0] = unit_status; assign out_pipe_tag_q[0] = result_tag_q; assign out_pipe_mask_q[0] = result_mask_q; assign out_pipe_aux_q[0] = result_aux_q; diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv index e3974bea..e99bb3af 100644 --- a/src/fpnew_opgroup_multifmt_slice.sv +++ b/src/fpnew_opgroup_multifmt_slice.sv @@ -119,7 +119,6 @@ or on 16b inputs producing 32b outputs"); logic result_fmt_is_int, result_is_cpk; logic [1:0] result_vec_op; // info for vectorial results (for packing) - logic simd_synch_rdy, simd_synch_done; fpnew_pkg::roundmode_e rnd_mode; // ----------- @@ -171,13 +170,13 @@ or on 16b inputs producing 32b outputs"); // ------------ if (OpGroup == fpnew_pkg::DIVSQRT) begin: gen_reduced_throughput_lanes // Reduced throughput specific lane signals - logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid, divsqrt_done, divsqrt_ready; // Handshake signals for the lanes + logic [NUM_LANES-1:0] lane_in_ready, lane_out_valid; // Handshake signals for the lanes TagType [NUM_LANES-1:0] lane_tags; // only the first one is actually used logic [NUM_LANES-1:0][AUX_BITS-1:0] lane_aux; // only the first one is actually used logic [NUM_LANES-1:0] lane_busy; // dito // Input side - assign in_ready_o = lane_in_ready[0]; // Upstream ready is given by first lane + assign in_ready_o = vectorial_op ? &lane_in_ready : lane_in_ready[0]; // Upstream ready is given all lanes if vectorial // --------------- // Generate Lanes @@ -218,7 +217,7 @@ or on 16b inputs producing 32b outputs"); logic [LANE_WIDTH-1:0] op_result; // lane-local results fpnew_pkg::status_t op_status; - assign in_valid = in_valid_i & ((lane == 0) | vectorial_op); // upper lanes only for vectors + assign in_valid = in_valid_i & ((lane == 0) | vectorial_op) & in_ready_o; // upper lanes only for vectors // Slice out the operands for this lane, upper bits are ignored in the unit always_comb begin : prepare_input @@ -277,13 +276,8 @@ or on 16b inputs producing 32b outputs"); .tag_i, .mask_i ( simd_mask_i[lane] ), .aux_i ( in_aux ), - .vectorial_op_i ( vectorial_op ), // synchronize only vectorial operations .in_valid_i ( in_valid ), .in_ready_o ( lane_in_ready[lane] ), - .divsqrt_done_o ( divsqrt_done[lane] ), - .simd_synch_done_i( simd_synch_done ), - .divsqrt_ready_o ( divsqrt_ready[lane] ), - .simd_synch_rdy_i ( simd_synch_rdy ), .flush_i, .result_o ( op_result ), .status_o ( op_status ), @@ -313,13 +307,9 @@ or on 16b inputs producing 32b outputs"); .tag_i, .mask_i ( simd_mask_i[lane] ), .aux_i ( in_aux ), - .vectorial_op_i ( vectorial_op ), // synchronize only vectorial operations .in_valid_i ( in_valid ), .in_ready_o ( lane_in_ready[lane] ), - .divsqrt_done_o ( divsqrt_done[lane] ), - .simd_synch_done_i( simd_synch_done ), - .divsqrt_ready_o ( divsqrt_ready[lane] ), - .simd_synch_rdy_i ( simd_synch_rdy ), + .flush_i, .result_o ( op_result ), .status_o ( op_status ), @@ -349,8 +339,6 @@ or on 16b inputs producing 32b outputs"); assign lane_in_ready[lane] = 1'b0; // unused lane assign lane_aux[lane] = 1'b0; // unused lane assign lane_tags[lane] = 1'b0; // unused lane - assign divsqrt_done[lane] = 1'b0; // unused lane - assign divsqrt_ready[lane] = 1'b0; // unused lane assign lane_busy[lane] = 1'b0; // Signals in any kind of laned instance @@ -390,21 +378,11 @@ or on 16b inputs producing 32b outputs"); assign ifmt_slice_result[ifmt] = '0; end - if ((DivSqrtSel != fpnew_pkg::TH32) && (OpGroup == fpnew_pkg::DIVSQRT)) begin - // Synch lanes if there is more than one - assign simd_synch_rdy = EnableVectors ? &divsqrt_ready[NUM_DIVSQRT_LANES-1:0] : divsqrt_ready[0]; - assign simd_synch_done = EnableVectors ? &divsqrt_done[NUM_DIVSQRT_LANES-1:0] : divsqrt_done[0]; - end else begin - // Unused (TH32 divider only supported for scalar FP32 divsqrt) - assign simd_synch_rdy = '0; - assign simd_synch_done = '0; - end - // Group signals from all lanes assign extension_bit_o = lane_ext_bit[0]; // don't care about upper ones assign tag_o = lane_tags[0]; // don't care about upper ones assign busy_o = lane_busy[0]; - assign out_valid_o = lane_out_valid[0]; // don't care about upper ones + assign out_valid_o = result_is_vector ? &lane_out_valid : lane_out_valid[0]; // Only care about upper ones if vectorial assign out_aux = lane_aux[0]; // don't care about upper ones // Lane is always non_conv