Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FP8 and FP8ALT support to THMULTI DivSqrt #135

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Bender.yml
Original file line number Diff line number Diff line change
@@ -37,6 +37,11 @@ sources:
- vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v
- vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
- vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
- src/fpnew_lut_div8.sv
- src/fpnew_lut_div8alt.sv
- src/fpnew_lut_sqrt8.sv
- src/fpnew_lut_sqrt8alt.sv
- src/fpnew_divsqrt_8_multi_lut.sv
- src/fpnew_divsqrt_th_32.sv
- src/fpnew_divsqrt_th_64_multi.sv
- src/fpnew_divsqrt_multi.sv
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -38,8 +38,8 @@ E.g.: Support for double-precision (64bit) operations and two simultaneous singl

It is also possible to generate only a subset of operations if e.g. divisions are not needed.

<sup>1</sup>Some compliance issues with IEEE 754-2008 are currently known to exist for the PULP DivSqrt unit (Rounding mismatches have been reported in GitHub issues. This can lead to results being off by 1ulp, and the inexact flag not being properly raised in these cases as well)<br>
<sup>2</sup>Two DivSqrt units are supported: the multi-format PULP DivSqrt unit and a 32-bit unit integrated from the T-Head OpenE906. The `PulpDivsqrt` parameter can be set to 1 or 0 to select the former or the latter unit, respectively.<br>
<sup>2</sup>Three DivSqrt units are supported: a multi-format 64-bit unit integrated from the T-Head OpenC910, the multi-format PULP DivSqrt unit, and a 32-bit unit integrated from the T-Head OpenE906. The `DivSqrtSel` parameter can be set to `THMULTI`, `PULP`, or `TH32`. `THMULTI` (the default) supports SIMD operations and leverages the unit integrated from the T-Head OpenC910 extended for FP16ALT, FP8, and FP8ALT support (thus supporting FP64, FP32, FP16, FP16ALT, FP8, and FP8ALT). `PULP` supports SIMD operations and selects the multi-format PULP DivSqrt unit (supporting FP64, FP32, FP16, FP16ALT, and FP8). `TH32` selects the 32-bit unit from OpenE906 supporting only FP32.<br>
<sup>1</sup>Some compliance issues with IEEE 754-2008 are currently known to exist for the PULP DivSqrt unit (Rounding mismatches have been reported in GitHub issues. This can lead to results being off by 1ulp, and the inexact flag not being properly raised in these cases as well).<br>
<sup>3</sup>Implementing IEEE 754-201x `minimumNumber` and `maximumNumber`, respectively

### Rounding modes
5 changes: 3 additions & 2 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -134,10 +134,11 @@ Enumeration of type `logic [2:0]` holding the supported FP formats.
| `FP16` | IEEE binary16 | 16 bit | 5 | 10 |
| `FP8` | binary8 | 8 bit | 5 | 2 |
| `FP16ALT` | binary16alt | 16 bit | 8 | 7 |
| `FP8ALT` | binary8alt | 8 bit | 4 | 3 |

The following global parameters associated with FP formats are set in `fpnew_pkg`:
```SystemVerilog
localparam int unsigned NUM_FP_FORMATS = 5;
localparam int unsigned NUM_FP_FORMATS = 6;
localparam int unsigned FP_FORMAT_BITS = $clog2(NUM_FP_FORMATS);
```

@@ -359,7 +360,7 @@ It is of type `divsqrt_unit_t`, which is defined as:
typedef enum logic[1:0] {
PULP, // "PULP" instantiates the PULP DivSqrt unit supports FP64, FP32, FP16, FP16ALT, FP8 and SIMD operations
TH32, // "TH32" instantiates the E906 DivSqrt unit supports only FP32 (no SIMD support)
THMULTI // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16, FP16ALT and SIMD operations
THMULTI // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16, FP16ALT, FP8, and FP8ALT and SIMD operations
} divsqrt_unit_t;
```

285 changes: 285 additions & 0 deletions src/fpnew_divsqrt_8_multi_lut.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
// Copyright 2024 ETH Zurich and University of Bologna.
//
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// SPDX-License-Identifier: SHL-0.51

// Authors: Luca Bertaccini <[email protected]>
// Stefan Mach <[email protected]>

`include "common_cells/registers.svh"

// 8-bit division / square-root lane backed by lookup tables.
//
// The lane forwards its (optionally pipelined) operands to one of four LUT
// modules, selected by the operation (DIV vs. anything else, which is treated
// as SQRT) and by the destination format (FP8 or FP8ALT). Result and status are
// taken from the selected LUT and passed through an optional output pipeline.
//
// Parameters:
//   NumPipeRegs - total number of pipeline register stages
//   PipeConfig  - placement of those stages (BEFORE / AFTER / INSIDE / DISTRIBUTED)
//   TagType     - type of the tag carried alongside each operation
//   AuxType     - type of the auxiliary payload carried alongside each operation
//
// Notes:
//   - is_boxed_i, simd_synch_done_i and simd_synch_rdy_i are part of the port
//     list but are not referenced inside this module.
//   - divsqrt_done_o and divsqrt_ready_o are tied high.
//   - For any destination format other than FP8 / FP8ALT both result and status
//     are driven to zero.
module fpnew_divsqrt_8_multi_lut #(
  // FPU configuration
  parameter int unsigned             NumPipeRegs = 0,
  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::AFTER,
  parameter type                     TagType     = logic,
  parameter type                     AuxType     = logic,
  // Do not change
  localparam int unsigned NUM_FORMATS    = fpnew_pkg::NUM_FP_FORMATS,
  localparam int unsigned ExtRegEnaWidth = NumPipeRegs == 0 ? 1 : NumPipeRegs
) (
  input  logic                        clk_i,
  input  logic                        rst_ni,
  // Input signals
  input  logic [1:0][7:0]             operands_i, // 2 operands
  input  logic [NUM_FORMATS-1:0][1:0] is_boxed_i, // 2 operands (unused in this module)
  input  fpnew_pkg::roundmode_e       rnd_mode_i,
  input  fpnew_pkg::operation_e       op_i,
  input  fpnew_pkg::fp_format_e       dst_fmt_i,
  input  TagType                      tag_i,
  input  logic                        mask_i,
  input  AuxType                      aux_i,
  input  logic                        vectorial_op_i,
  // Input Handshake
  input  logic                        in_valid_i,
  output logic                        in_ready_o,
  output logic                        divsqrt_done_o,
  input  logic                        simd_synch_done_i, // unused in this module
  output logic                        divsqrt_ready_o,
  input  logic                        simd_synch_rdy_i,  // unused in this module
  input  logic                        flush_i,
  // Output signals
  output logic [7:0]                  result_o,
  output fpnew_pkg::status_t          status_o,
  output logic                        extension_bit_o,
  output TagType                      tag_o,
  output logic                        mask_o,
  output AuxType                      aux_o,
  // Output handshake
  output logic                        out_valid_o,
  input  logic                        out_ready_i,
  // Indication of valid data in flight
  output logic                        busy_o,
  // External register enable override
  input  logic [ExtRegEnaWidth-1:0]   reg_ena_i
);

  // ----------
  // Constants
  // ----------
  // Pipelines
  localparam NUM_INP_REGS = (PipeConfig == fpnew_pkg::BEFORE)
                            ? NumPipeRegs
                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
                               ? (NumPipeRegs / 2) // Last to get distributed regs
                               : 0); // no regs here otherwise
  localparam NUM_OUT_REGS = (PipeConfig == fpnew_pkg::AFTER || PipeConfig == fpnew_pkg::INSIDE)
                            ? NumPipeRegs
                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
                               ? ((NumPipeRegs + 1) / 2) // First to get distributed regs
                               : 0); // no regs here otherwise

  // ---------------
  // Input pipeline
  // ---------------
  // Selected pipeline output signals as non-arrays
  logic [1:0][7:0]       operands_q;
  fpnew_pkg::roundmode_e rnd_mode_q; // registered but not consumed by the LUTs
  fpnew_pkg::operation_e op_q;
  fpnew_pkg::fp_format_e dst_fmt_q;

  // Input pipeline signals, index i holds signal after i register stages
  logic                  [0:NUM_INP_REGS][1:0][7:0] inp_pipe_operands_q;
  fpnew_pkg::roundmode_e [0:NUM_INP_REGS]           inp_pipe_rnd_mode_q;
  fpnew_pkg::operation_e [0:NUM_INP_REGS]           inp_pipe_op_q;
  fpnew_pkg::fp_format_e [0:NUM_INP_REGS]           inp_pipe_dst_fmt_q;
  TagType                [0:NUM_INP_REGS]           inp_pipe_tag_q;
  logic                  [0:NUM_INP_REGS]           inp_pipe_mask_q;
  AuxType                [0:NUM_INP_REGS]           inp_pipe_aux_q;
  logic                  [0:NUM_INP_REGS]           inp_pipe_vec_op_q;
  logic                  [0:NUM_INP_REGS]           inp_pipe_valid_q;
  // Ready signal is combinatorial for all stages
  logic [0:NUM_INP_REGS] inp_pipe_ready;

  // Input stage: First element of pipeline is taken from inputs
  assign inp_pipe_operands_q[0] = operands_i;
  assign inp_pipe_rnd_mode_q[0] = rnd_mode_i;
  assign inp_pipe_op_q[0]       = op_i;
  assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
  assign inp_pipe_tag_q[0]      = tag_i;
  assign inp_pipe_mask_q[0]     = mask_i;
  assign inp_pipe_aux_q[0]      = aux_i;
  assign inp_pipe_vec_op_q[0]   = vectorial_op_i;
  assign inp_pipe_valid_q[0]    = in_valid_i;
  // Input stage: Propagate pipeline ready signal to upstream circuitry
  assign in_ready_o = inp_pipe_ready[0];
  // Generate the register stages
  for (genvar i = 0; i < NUM_INP_REGS; i++) begin : gen_input_pipeline
    // Internal register enable for this stage
    logic reg_ena;
    // Determine the ready signal of the current stage - advance the pipeline:
    // 1. if the next stage is ready for our data
    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
    assign inp_pipe_ready[i] = inp_pipe_ready[i+1] | ~inp_pipe_valid_q[i+1];
    // Valid: enabled by ready signal, synchronous clear with the flush signal
    `FFLARNC(inp_pipe_valid_q[i+1], inp_pipe_valid_q[i], inp_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
    // Enable register if pipeline ready and a valid data item is present
    assign reg_ena = (inp_pipe_ready[i] & inp_pipe_valid_q[i]) | reg_ena_i[i];
    // Generate the pipeline registers within the stages, use enable-registers
    `FFL(inp_pipe_operands_q[i+1], inp_pipe_operands_q[i], reg_ena, '0)
    `FFL(inp_pipe_rnd_mode_q[i+1], inp_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE)
    `FFL(inp_pipe_op_q[i+1],       inp_pipe_op_q[i],       reg_ena, fpnew_pkg::FMADD)
    `FFL(inp_pipe_dst_fmt_q[i+1],  inp_pipe_dst_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
    `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
    `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
    // vec_op is a single logic bit: reset it with '0 rather than an AuxType cast
    `FFL(inp_pipe_vec_op_q[i+1],   inp_pipe_vec_op_q[i],   reg_ena, '0)
  end
  // Output stage: assign selected pipe outputs to signals for later use
  assign operands_q = inp_pipe_operands_q[NUM_INP_REGS];
  assign rnd_mode_q = inp_pipe_rnd_mode_q[NUM_INP_REGS];
  assign op_q       = inp_pipe_op_q[NUM_INP_REGS];
  assign dst_fmt_q  = inp_pipe_dst_fmt_q[NUM_INP_REGS];

  logic div_valid, sqrt_valid; // operation select, qualified by valid
  logic op_starting;           // high whenever a valid, non-flushed operation is presented

  // Valids are taken from the last input pipeline stage and masked by flush.
  // Every operation other than DIV is routed to the sqrt path.
  assign div_valid   = inp_pipe_valid_q[NUM_INP_REGS] & (op_q == fpnew_pkg::DIV) & ~flush_i;
  assign sqrt_valid  = inp_pipe_valid_q[NUM_INP_REGS] & (op_q != fpnew_pkg::DIV) & ~flush_i;
  assign op_starting = div_valid | sqrt_valid;

  // -----------------
  // DIVSQRT instance
  // -----------------
  logic [1:0][7:0] operands_div8;
  logic [1:0][7:0] operands_div8alt;
  logic [7:0]      operand_sqrt8;
  logic [7:0]      operand_sqrt8alt;

  // Only the LUT that is actually selected sees the operands; all other LUT
  // inputs are held at zero.
  always_comb begin : silence_inputs_of_unused_units
    operands_div8    = '0;
    operands_div8alt = '0;
    operand_sqrt8    = '0;
    operand_sqrt8alt = '0;
    if (op_starting) begin
      if (div_valid && dst_fmt_q == fpnew_pkg::FP8) begin
        operands_div8 = operands_q;
      end else if (div_valid && dst_fmt_q == fpnew_pkg::FP8ALT) begin
        operands_div8alt = operands_q;
      end else if (sqrt_valid && dst_fmt_q == fpnew_pkg::FP8) begin
        operand_sqrt8 = operands_q[0];
      end else if (sqrt_valid && dst_fmt_q == fpnew_pkg::FP8ALT) begin
        operand_sqrt8alt = operands_q[0];
      end
    end
  end

  logic [7:0] result_div8;
  logic [7:0] result_div8alt;
  logic [7:0] result_sqrt8;
  logic [7:0] result_sqrt8alt;
  logic [4:0] status_div8, status_div8alt, status_sqrt8, status_sqrt8alt;

  // The division LUTs are indexed with operand 0 in the upper byte and
  // operand 1 in the lower byte.
  fpnew_lut_div8 i_div8_lut (
    .input_i  ({operands_div8[0], operands_div8[1]}),
    .out_o    (result_div8),
    .status_o (status_div8)
  );

  fpnew_lut_div8alt i_div8alt_lut (
    .input_i  ({operands_div8alt[0], operands_div8alt[1]}),
    .out_o    (result_div8alt),
    .status_o (status_div8alt)
  );

  fpnew_lut_sqrt8 i_sqrt8_lut (
    .input_i  (operand_sqrt8),
    .out_o    (result_sqrt8),
    .status_o (status_sqrt8)
  );

  fpnew_lut_sqrt8alt i_sqrt8alt_lut (
    .input_i  (operand_sqrt8alt),
    .out_o    (result_sqrt8alt),
    .status_o (status_sqrt8alt)
  );

  // --------------
  // Output Select
  // --------------
  logic [7:0]         result_d;
  fpnew_pkg::status_t status_d;

  always_comb begin : select_output
    // Default both outputs so that no latch is inferred when no LUT is
    // selected (idle cycle, flush, or a destination format other than
    // FP8 / FP8ALT).
    result_d = '0;
    status_d = '0;
    if (div_valid && dst_fmt_q == fpnew_pkg::FP8) begin
      result_d = result_div8;
      status_d = status_div8;
    end else if (div_valid && dst_fmt_q == fpnew_pkg::FP8ALT) begin
      result_d = result_div8alt;
      status_d = status_div8alt;
    end else if (sqrt_valid && dst_fmt_q == fpnew_pkg::FP8) begin
      result_d = result_sqrt8;
      status_d = status_sqrt8;
    end else if (sqrt_valid && dst_fmt_q == fpnew_pkg::FP8ALT) begin
      result_d = result_sqrt8alt;
      status_d = status_sqrt8alt;
    end
  end

  // ----------------
  // Output Pipeline
  // ----------------
  // Output pipeline signals, index i holds signal after i register stages
  logic               [0:NUM_OUT_REGS][7:0] out_pipe_result_q;
  fpnew_pkg::status_t [0:NUM_OUT_REGS]      out_pipe_status_q;
  TagType             [0:NUM_OUT_REGS]      out_pipe_tag_q;
  logic               [0:NUM_OUT_REGS]      out_pipe_mask_q;
  AuxType             [0:NUM_OUT_REGS]      out_pipe_aux_q;
  logic               [0:NUM_OUT_REGS]      out_pipe_valid_q;
  // Ready signal is combinatorial for all stages
  logic [0:NUM_OUT_REGS] out_pipe_ready;

  // Input stage: First element of pipeline is taken from inputs
  assign out_pipe_result_q[0] = result_d;
  assign out_pipe_status_q[0] = status_d;
  assign out_pipe_tag_q[0]    = inp_pipe_tag_q[NUM_INP_REGS];
  assign out_pipe_mask_q[0]   = inp_pipe_mask_q[NUM_INP_REGS];
  assign out_pipe_aux_q[0]    = inp_pipe_aux_q[NUM_INP_REGS];
  assign out_pipe_valid_q[0]  = inp_pipe_valid_q[NUM_INP_REGS];
  // Input stage: Propagate pipeline ready signal to inside pipe
  assign inp_pipe_ready[NUM_INP_REGS] = out_pipe_ready[0];
  // Generate the register stages
  for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
    // Internal register enable for this stage
    logic reg_ena;
    // Determine the ready signal of the current stage - advance the pipeline:
    // 1. if the next stage is ready for our data
    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
    assign out_pipe_ready[i] = out_pipe_ready[i+1] | ~out_pipe_valid_q[i+1];
    // Valid: enabled by ready signal, synchronous clear with the flush signal
    `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
    // Enable register if pipeline ready and a valid data item is present;
    // the external enables for the output stages follow those of the input stages
    assign reg_ena = (out_pipe_ready[i] & out_pipe_valid_q[i]) | reg_ena_i[NUM_INP_REGS + i];
    // Generate the pipeline registers within the stages, use enable-registers
    `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
    `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
    `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
    `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
  end
  // Output stage: Ready travels backwards from output side, driven by downstream circuitry
  assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
  // Output stage: assign module outputs
  assign result_o        = out_pipe_result_q[NUM_OUT_REGS];
  assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
  assign extension_bit_o = 1'b1; // always NaN-Box result
  assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
  assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
  assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
  assign busy_o          = (| {inp_pipe_valid_q, op_starting, out_pipe_valid_q});

  // The done / ready indications are constantly asserted by this lane.
  assign divsqrt_done_o  = 1'b1;
  assign divsqrt_ready_o = 1'b1;
endmodule

Loading