Add option for fast high-half multiplies

This commit is contained in:
Luke Wren 2021-11-29 18:48:02 +00:00
parent 35c5e213c7
commit c8afb4ac33
6 changed files with 137 additions and 35 deletions

View File

@ -15,21 +15,55 @@
* *
*********************************************************************/
// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
// MULH_FAST) are handled by different circuits. In either case it's a simple
// behavioural multiply, and we rely on inference to get good performance on
// FPGA.
`default_nettype none
module hazard3_mul_fast #(
parameter XLEN = 32
`include "hazard3_config.vh"
,
`include "hazard3_width_const.vh"
) (
input wire clk,
input wire rst_n,
input wire [XLEN-1:0] op_a,
input wire [XLEN-1:0] op_b,
input wire op_vld,
input wire clk,
input wire rst_n,
output wire [XLEN-1:0] result,
input wire [W_MULOP-1:0] op,
input wire op_vld,
input wire [W_DATA-1:0] op_a,
input wire [W_DATA-1:0] op_b,
output wire [W_DATA-1:0] result,
output reg result_vld
);
`include "hazard3_ops.vh"
localparam XLEN = W_DATA;
//synthesis translate_off
// Simulation-only configuration check: abort if MULH_FAST is set without
// MUL_FAST. The first argument of $fatal is the finish_number (0, 1 or 2), so
// the message is passed after an explicit finish_number of 1; a bare string in
// the first position is not a conforming call and tools may drop the message.
generate if (MULH_FAST && !MUL_FAST)
initial $fatal(1, "%m: MULH_FAST requires that MUL_FAST is also set.");
endgenerate
//synthesis translate_on
// Latency of 1: result_vld is op_vld delayed by one clock, so it is asserted
// in the cycle after an operation is issued. It is cleared asynchronously
// while rst_n is low.
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
result_vld <= 1'b0;
end else begin
// Registered copy of the issue strobe
result_vld <= op_vld;
end
end
// ----------------------------------------------------------------------------
// Fast MUL only
generate
if (!MULH_FAST) begin: mul_only
// This pipestage is folded into the front of the DSP tiles on UP5k. Note the
// intention is to register the bypassed core regs at the end of X (since
// bypass is quite slow), then perform multiply combinatorially in stage M,
@ -66,14 +100,55 @@ assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
`endif
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
result_vld <= 1'b0;
end else begin
result_vld <= op_vld;
// ----------------------------------------------------------------------------
// Fast MUL/MULH/MULHU/MULHSU
end else begin: mul_and_mulh
// Operand/opcode registers. These have no reset, and only capture new values
// when op_vld is asserted, so they hold the last issued operation until the
// next one arrives.
reg [XLEN-1:0] op_a_r;
reg [XLEN-1:0] op_b_r;
reg [W_MULOP-1:0] op_r;
always @ (posedge clk) begin
if (op_vld) begin
op_a_r <= op_a;
op_b_r <= op_b;
op_r <= op;
end
end
// Signedness is decoded from the registered opcode: operand A is treated as
// signed for MULH and MULHSU, operand B only for MULH. All other opcodes
// treat both operands as unsigned.
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
wire op_b_signed = op_r == M_OP_MULH;
// Extend each operand to 2*XLEN bits: the upper half is filled with the
// operand's MSB when it is signed, and with zeroes otherwise.
wire [2*XLEN-1:0] op_a_sext = {
{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
op_a_r
};
wire [2*XLEN-1:0] op_b_sext = {
{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
op_b_r
};
// A single 2*XLEN-wide multiply of the extended operands. Because both
// operands were extended to the full product width, the low 2*XLEN bits of
// this product are the correct full-width result for every signedness
// combination.
wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;
`ifndef RISCV_FORMAL_ALTOPS
// MUL returns the low half of the product; every other opcode returns the
// high half.
assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];
`else
// With RISCV_FORMAL_ALTOPS defined the multiplier is replaced by an add or
// subtract XORed with a per-opcode constant; unrecognised opcodes return
// 32'hdeadbeef.
// NOTE(review): the constants presumably match the alternative-operation
// models used by riscv-formal -- confirm against that project. The literals
// are 32 bits wide, so this branch assumes XLEN == 32.
assign result =
op_r == M_OP_MULH ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
op_r == M_OP_MULHU ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
op_r == M_OP_MUL ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
`endif
end
endgenerate
endmodule
`default_nettype wire

View File

@ -30,9 +30,8 @@
`default_nettype none
module hazard3_muldiv_seq #(
parameter XLEN = 32,
parameter UNROLL = 1,
parameter W_CTR = $clog2(XLEN + 1), // do not modify
`include "hazard3_config.vh"
,
`include "hazard3_width_const.vh"
) (
input wire clk,
@ -41,22 +40,25 @@ module hazard3_muldiv_seq #(
input wire op_vld,
output wire op_rdy,
input wire op_kill,
input wire [XLEN-1:0] op_a,
input wire [XLEN-1:0] op_b,
input wire [W_DATA-1:0] op_a,
input wire [W_DATA-1:0] op_b,
output wire [XLEN-1:0] result_h, // mulh* or rem*
output wire [XLEN-1:0] result_l, // mul or div*
output wire [W_DATA-1:0] result_h, // mulh* or rem*
output wire [W_DATA-1:0] result_l, // mul or div*
output wire result_vld
);
`include "hazard3_ops.vh"
//synthesis translate_off
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
initial $fatal("%m: UNROLL must be a positive power of 2");
// Simulation-only configuration check: MULDIV_UNROLL must be nonzero with at
// most one bit set. $fatal takes the finish_number as its first argument, so
// the message follows an explicit finish_number of 1.
generate if (MULDIV_UNROLL & (MULDIV_UNROLL - 1) || ~|MULDIV_UNROLL)
initial $fatal(1, "%m: MULDIV_UNROLL must be a positive power of 2");
endgenerate
//synthesis translate_on
localparam XLEN = W_DATA;
parameter W_CTR = $clog2(XLEN + 1);
// ----------------------------------------------------------------------------
// Operation decode, operand sign adjustment
@ -85,7 +87,10 @@ wire op_b_signed =
wire op_a_neg = op_a_signed && accum[XLEN-1];
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
wire is_div = op_r[2];
// Non-divide parts of the circuit should be constant-folded if all the MUL
// operations are handled by the fast multiplier
wire is_div = op_r[2] || (MUL_FAST && MULH_FAST);
// Controls for modifying sign of all/part of accumulator
wire accum_neg_l;
@ -109,7 +114,7 @@ always @ (*) begin: alu
addend = {2*XLEN{1'b0}};
addsub_tmp = {2*XLEN{1'b0}};
neg_l_borrow = 1'b0;
for (i = 0; i < UNROLL; i = i + 1) begin
for (i = 0; i < MULDIV_UNROLL; i = i + 1) begin
addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
shift_tmp = is_div ? accum_next : accum_next >> 1;
addsub_tmp = shift_tmp + addend;
@ -167,11 +172,11 @@ always @ (posedge clk or negedge rst_n) begin
if (op_b_neg ^ is_div)
op_b_r <= -op_b_r;
end else begin
ctr <= ctr - UNROLL[W_CTR-1:0];
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
accum <= accum_next;
end
end else if (|ctr) begin
ctr <= ctr - UNROLL[W_CTR-1:0];
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
accum <= accum_next;
end else if (!sign_postadj_done || sign_postadj_carry) begin
sign_postadj_done <= 1'b1;
@ -287,7 +292,7 @@ always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
// are forced in immediately, simultaneous with a kill, in which case there
// is no intermediate ready state.
alive = op_rdy || (op_kill && op_vld);
for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
for (i = 1; i <= XLEN / MULDIV_UNROLL + 3; i = i + 1)
alive = alive || $past(op_rdy || (op_kill && op_vld), i);
assert(alive);
end

View File

@ -94,10 +94,14 @@ parameter REDUCED_BYPASS = 0,
parameter MULDIV_UNROLL = 1,
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
// to stage M. The sequential multiply/divide circuit is still used for
// MULH/MULHU/MULHSU.
// to stage M. The sequential multiply/divide circuit is still used for MULH*
parameter MUL_FAST = 0,
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
// the multiply functionality from the sequential multiply/divide circuit.
// Requires: MUL_FAST
parameter MULH_FAST = 0,
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
// making trap vector base partly fixed (legal, as it's WARL).
//
@ -105,7 +109,7 @@ parameter MUL_FAST = 0,
//
// - Note the entire vector table must always be aligned to its size, rounded
// up to a power of two, so careful with the low-order bits.
parameter MTVEC_WMASK = 32'hffffffff,
parameter MTVEC_WMASK = 32'hfffffffd,
// ----------------------------------------------------------------------------
// Port size parameters (do not modify)

View File

@ -21,6 +21,7 @@
.REDUCED_BYPASS (REDUCED_BYPASS),
.MULDIV_UNROLL (MULDIV_UNROLL),
.MUL_FAST (MUL_FAST),
.MULH_FAST (MULH_FAST),
.MTVEC_WMASK (MTVEC_WMASK),
.W_ADDR (W_ADDR),
.W_DATA (W_DATA)

View File

@ -406,14 +406,18 @@ if (EXTENSION_M) begin: has_muldiv
wire x_muldiv_kill = m_trap_enter_soon;
wire x_use_fast_mul = MUL_FAST && d_aluop == ALUOP_MULDIV && d_mulop == M_OP_MUL;
// A MULDIV-class instruction goes to the fast multiplier when its opcode is
// covered by the configured fast circuit: MUL needs MUL_FAST, and each of the
// high-half opcodes (MULH, MULHU, MULHSU) needs MULH_FAST.
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
	(MUL_FAST && d_mulop == M_OP_MUL) ||
	(MULH_FAST && (
		d_mulop == M_OP_MULH  ||
		d_mulop == M_OP_MULHU ||
		d_mulop == M_OP_MULHSU
	))
);
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
hazard3_muldiv_seq #(
.XLEN (W_DATA),
.UNROLL (MULDIV_UNROLL)
`include "hazard3_config_inst.vh"
) muldiv (
.clk (clk),
.rst_n (rst_n),
@ -444,14 +448,15 @@ if (EXTENSION_M) begin: has_muldiv
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
hazard3_mul_fast #(
.XLEN(W_DATA)
) inst_hazard3_mul_fast (
`include "hazard3_config_inst.vh"
) mul_fast (
.clk (clk),
.rst_n (rst_n),
.op_vld (x_issue_fast_mul),
.op (d_mulop),
.op_a (x_rs1_bypass),
.op_b (x_rs2_bypass),
.op_vld (x_issue_fast_mul),
.result (m_fast_mul_result),
.result_vld (m_fast_mul_result_vld)

View File

@ -2,11 +2,18 @@ TOP := hazard3_cpu_2port
CDEFINES := DUAL_PORT
CPU_RESET_VECTOR := 32'hc0
EXTENSION_C := 1
EXTENSION_M := 1
EXTENSION_ZBA := 1
EXTENSION_ZBB := 1
EXTENSION_ZBC := 1
EXTENSION_ZBS := 1
DEBUG_SUPPORT := 0
MULDIV_UNROLL := 2
MUL_FAST := 1
MULH_FAST := 1
REDUCED_BYPASS := 0
.PHONY: clean tb all
@ -16,12 +23,17 @@ all: tb
SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBA $(EXTENSION_ZBA) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBB $(EXTENSION_ZBB) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBC $(EXTENSION_ZBC) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBS $(EXTENSION_ZBS) $(TOP);
SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
SYNTH_CMD += write_cxxrtl dut.cpp
dut.cpp: