Add option for fast high-half multiplies

This commit is contained in:
Luke Wren 2021-11-29 18:48:02 +00:00
parent 35c5e213c7
commit c8afb4ac33
6 changed files with 137 additions and 35 deletions

View File

@ -15,21 +15,55 @@
* * * *
*********************************************************************/ *********************************************************************/
// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
// MULH_FAST) are handled by different circuits. In either case it's a simple
// behavioural multiply, and we rely on inference to get good performance on
// FPGA.
`default_nettype none `default_nettype none
module hazard3_mul_fast #( module hazard3_mul_fast #(
parameter XLEN = 32 `include "hazard3_config.vh"
,
`include "hazard3_width_const.vh"
) ( ) (
input wire clk, input wire clk,
input wire rst_n, input wire rst_n,
input wire [XLEN-1:0] op_a,
input wire [XLEN-1:0] op_b,
input wire op_vld,
output wire [XLEN-1:0] result, input wire [W_MULOP-1:0] op,
input wire op_vld,
input wire [W_DATA-1:0] op_a,
input wire [W_DATA-1:0] op_b,
output wire [W_DATA-1:0] result,
output reg result_vld output reg result_vld
); );
`include "hazard3_ops.vh"
localparam XLEN = W_DATA;
//synthesis translate_off
// Simulation-only configuration check: MULH_FAST extends the fast multiplier
// that MUL_FAST enables, so MULH_FAST without MUL_FAST is rejected at time 0.
// $fatal takes the finish number as its first argument and the format string
// second; 1 is the default finish number.
generate if (MULH_FAST && !MUL_FAST)
initial $fatal(1, "%m: MULH_FAST requires that MUL_FAST is also set.");
endgenerate
//synthesis translate_on
// Latency of 1: result_vld follows op_vld by exactly one clock cycle.
// The flag is cleared asynchronously while rst_n is low.
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
result_vld <= 1'b0;
end else begin
// Registered copy of op_vld from the previous cycle.
result_vld <= op_vld;
end
end
// ----------------------------------------------------------------------------
// Fast MUL only
generate
if (!MULH_FAST) begin: mul_only
// This pipestage is folded into the front of the DSP tiles on UP5k. Note the // This pipestage is folded into the front of the DSP tiles on UP5k. Note the
// intention is to register the bypassed core regs at the end of X (since // intention is to register the bypassed core regs at the end of X (since
// bypass is quite slow), then perform multiply combinatorially in stage M, // bypass is quite slow), then perform multiply combinatorially in stage M,
@ -66,14 +100,55 @@ assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
`endif `endif
always @ (posedge clk or negedge rst_n) begin // ----------------------------------------------------------------------------
if (!rst_n) begin // Fast MUL/MULH/MULHU/MULHSU
result_vld <= 1'b0;
end else begin end else begin: mul_and_mulh
result_vld <= op_vld;
reg [XLEN-1:0] op_a_r;
reg [XLEN-1:0] op_b_r;
reg [W_MULOP-1:0] op_r;
always @ (posedge clk) begin
if (op_vld) begin
op_a_r <= op_a;
op_b_r <= op_b;
op_r <= op;
end end
end end
// Operand signedness, decoded from the registered operation op_r:
// op_a is treated as signed for MULH and MULHSU, op_b only for MULH.
// For any other op_r value both operands are treated as unsigned.
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
wire op_b_signed = op_r == M_OP_MULH;
// Extend each registered operand to 2*XLEN bits. The upper XLEN bits are a
// replication of the operand's MSB when that operand is signed, and zero
// otherwise.
wire [2*XLEN-1:0] op_a_sext = {
{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
op_a_r
};
wire [2*XLEN-1:0] op_b_sext = {
{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
op_b_r
};
// Product of the extended operands, kept to 2*XLEN bits by the width of
// result_full.
wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;
`ifndef RISCV_FORMAL_ALTOPS
// MUL returns the low XLEN bits of the product; every other op_r value
// returns the high XLEN bits.
assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];
`else
// With RISCV_FORMAL_ALTOPS defined, the multiply is replaced by an add or
// subtract of the registered operands XORed with a per-operation constant.
// An op_r value matching none of the four operations yields 32'hdeadbeef.
// NOTE(review): presumably these constants mirror the alternative operations
// defined by the riscv-formal framework -- confirm against that project.
assign result =
op_r == M_OP_MULH ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
op_r == M_OP_MULHU ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
op_r == M_OP_MUL ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
`endif
end
endgenerate
endmodule endmodule
`default_nettype wire `default_nettype wire

View File

@ -30,9 +30,8 @@
`default_nettype none `default_nettype none
module hazard3_muldiv_seq #( module hazard3_muldiv_seq #(
parameter XLEN = 32, `include "hazard3_config.vh"
parameter UNROLL = 1, ,
parameter W_CTR = $clog2(XLEN + 1), // do not modify
`include "hazard3_width_const.vh" `include "hazard3_width_const.vh"
) ( ) (
input wire clk, input wire clk,
@ -41,22 +40,25 @@ module hazard3_muldiv_seq #(
input wire op_vld, input wire op_vld,
output wire op_rdy, output wire op_rdy,
input wire op_kill, input wire op_kill,
input wire [XLEN-1:0] op_a, input wire [W_DATA-1:0] op_a,
input wire [XLEN-1:0] op_b, input wire [W_DATA-1:0] op_b,
output wire [XLEN-1:0] result_h, // mulh* or rem* output wire [W_DATA-1:0] result_h, // mulh* or rem*
output wire [XLEN-1:0] result_l, // mul or div* output wire [W_DATA-1:0] result_l, // mul or div*
output wire result_vld output wire result_vld
); );
`include "hazard3_ops.vh" `include "hazard3_ops.vh"
//synthesis translate_off //synthesis translate_off
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL) generate if (MULDIV_UNROLL & (MULDIV_UNROLL - 1) || ~|MULDIV_UNROLL)
initial $fatal("%m: UNROLL must be a positive power of 2"); initial $fatal("%m: MULDIV_UNROLL must be a positive power of 2");
endgenerate endgenerate
//synthesis translate_on //synthesis translate_on
localparam XLEN = W_DATA;
parameter W_CTR = $clog2(XLEN + 1);
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Operation decode, operand sign adjustment // Operation decode, operand sign adjustment
@ -85,7 +87,10 @@ wire op_b_signed =
wire op_a_neg = op_a_signed && accum[XLEN-1]; wire op_a_neg = op_a_signed && accum[XLEN-1];
wire op_b_neg = op_b_signed && op_b_r[XLEN-1]; wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
wire is_div = op_r[2]; // Non-divide parts of the circuit should be constant-folded if all the MUL
// operations are handled by the fast multiplier
wire is_div = op_r[2] || (MUL_FAST && MULH_FAST);
// Controls for modifying sign of all/part of accumulator // Controls for modifying sign of all/part of accumulator
wire accum_neg_l; wire accum_neg_l;
@ -109,7 +114,7 @@ always @ (*) begin: alu
addend = {2*XLEN{1'b0}}; addend = {2*XLEN{1'b0}};
addsub_tmp = {2*XLEN{1'b0}}; addsub_tmp = {2*XLEN{1'b0}};
neg_l_borrow = 1'b0; neg_l_borrow = 1'b0;
for (i = 0; i < UNROLL; i = i + 1) begin for (i = 0; i < MULDIV_UNROLL; i = i + 1) begin
addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}}; addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
shift_tmp = is_div ? accum_next : accum_next >> 1; shift_tmp = is_div ? accum_next : accum_next >> 1;
addsub_tmp = shift_tmp + addend; addsub_tmp = shift_tmp + addend;
@ -167,11 +172,11 @@ always @ (posedge clk or negedge rst_n) begin
if (op_b_neg ^ is_div) if (op_b_neg ^ is_div)
op_b_r <= -op_b_r; op_b_r <= -op_b_r;
end else begin end else begin
ctr <= ctr - UNROLL[W_CTR-1:0]; ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
accum <= accum_next; accum <= accum_next;
end end
end else if (|ctr) begin end else if (|ctr) begin
ctr <= ctr - UNROLL[W_CTR-1:0]; ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
accum <= accum_next; accum <= accum_next;
end else if (!sign_postadj_done || sign_postadj_carry) begin end else if (!sign_postadj_done || sign_postadj_carry) begin
sign_postadj_done <= 1'b1; sign_postadj_done <= 1'b1;
@ -287,7 +292,7 @@ always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
// are forced in immediately, simultaneous with a kill, in which case there // are forced in immediately, simultaneous with a kill, in which case there
// is no intermediate ready state. // is no intermediate ready state.
alive = op_rdy || (op_kill && op_vld); alive = op_rdy || (op_kill && op_vld);
for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1) for (i = 1; i <= XLEN / MULDIV_UNROLL + 3; i = i + 1)
alive = alive || $past(op_rdy || (op_kill && op_vld), i); alive = alive || $past(op_rdy || (op_kill && op_vld), i);
assert(alive); assert(alive);
end end

View File

@ -94,10 +94,14 @@ parameter REDUCED_BYPASS = 0,
parameter MULDIV_UNROLL = 1, parameter MULDIV_UNROLL = 1,
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring // MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
// to stage M. The sequential multiply/divide circuit is still used for // to stage M. The sequential multiply/divide circuit is still used for MULH*
// MULH/MULHU/MULHSU.
parameter MUL_FAST = 0, parameter MUL_FAST = 0,
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
// the multiply functionality from the sequential multiply/divide circuit.
// Requires: MUL_FAST
parameter MULH_FAST = 0,
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by // MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
// making trap vector base partly fixed (legal, as it's WARL). // making trap vector base partly fixed (legal, as it's WARL).
// //
@ -105,7 +109,7 @@ parameter MUL_FAST = 0,
// //
// - Note the entire vector table must always be aligned to its size, rounded // - Note the entire vector table must always be aligned to its size, rounded
// up to a power of two, so careful with the low-order bits. // up to a power of two, so careful with the low-order bits.
parameter MTVEC_WMASK = 32'hffffffff, parameter MTVEC_WMASK = 32'hfffffffd,
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Port size parameters (do not modify) // Port size parameters (do not modify)

View File

@ -21,6 +21,7 @@
.REDUCED_BYPASS (REDUCED_BYPASS), .REDUCED_BYPASS (REDUCED_BYPASS),
.MULDIV_UNROLL (MULDIV_UNROLL), .MULDIV_UNROLL (MULDIV_UNROLL),
.MUL_FAST (MUL_FAST), .MUL_FAST (MUL_FAST),
.MULH_FAST (MULH_FAST),
.MTVEC_WMASK (MTVEC_WMASK), .MTVEC_WMASK (MTVEC_WMASK),
.W_ADDR (W_ADDR), .W_ADDR (W_ADDR),
.W_DATA (W_DATA) .W_DATA (W_DATA)

View File

@ -406,14 +406,18 @@ if (EXTENSION_M) begin: has_muldiv
wire x_muldiv_kill = m_trap_enter_soon; wire x_muldiv_kill = m_trap_enter_soon;
wire x_use_fast_mul = MUL_FAST && d_aluop == ALUOP_MULDIV && d_mulop == M_OP_MUL; wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
MUL_FAST && d_mulop == M_OP_MUL ||
MULH_FAST && d_mulop == M_OP_MULH ||
MULH_FAST && d_mulop == M_OP_MULHU ||
MULH_FAST && d_mulop == M_OP_MULHSU
);
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul) assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill); && !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
hazard3_muldiv_seq #( hazard3_muldiv_seq #(
.XLEN (W_DATA), `include "hazard3_config_inst.vh"
.UNROLL (MULDIV_UNROLL)
) muldiv ( ) muldiv (
.clk (clk), .clk (clk),
.rst_n (rst_n), .rst_n (rst_n),
@ -444,14 +448,15 @@ if (EXTENSION_M) begin: has_muldiv
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall; wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
hazard3_mul_fast #( hazard3_mul_fast #(
.XLEN(W_DATA) `include "hazard3_config_inst.vh"
) inst_hazard3_mul_fast ( ) mul_fast (
.clk (clk), .clk (clk),
.rst_n (rst_n), .rst_n (rst_n),
.op_vld (x_issue_fast_mul),
.op (d_mulop),
.op_a (x_rs1_bypass), .op_a (x_rs1_bypass),
.op_b (x_rs2_bypass), .op_b (x_rs2_bypass),
.op_vld (x_issue_fast_mul),
.result (m_fast_mul_result), .result (m_fast_mul_result),
.result_vld (m_fast_mul_result_vld) .result_vld (m_fast_mul_result_vld)

View File

@ -2,11 +2,18 @@ TOP := hazard3_cpu_2port
CDEFINES := DUAL_PORT CDEFINES := DUAL_PORT
CPU_RESET_VECTOR := 32'hc0 CPU_RESET_VECTOR := 32'hc0
EXTENSION_C := 1 EXTENSION_C := 1
EXTENSION_M := 1 EXTENSION_M := 1
EXTENSION_ZBA := 1
EXTENSION_ZBB := 1
EXTENSION_ZBC := 1
EXTENSION_ZBS := 1
DEBUG_SUPPORT := 0 DEBUG_SUPPORT := 0
MULDIV_UNROLL := 2 MULDIV_UNROLL := 2
MUL_FAST := 1 MUL_FAST := 1
MULH_FAST := 1
REDUCED_BYPASS := 0 REDUCED_BYPASS := 0
.PHONY: clean tb all .PHONY: clean tb all
@ -16,12 +23,17 @@ all: tb
SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f); SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP); SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP); SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBA $(EXTENSION_ZBA) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBB $(EXTENSION_ZBB) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBC $(EXTENSION_ZBC) $(TOP);
SYNTH_CMD += chparam -set EXTENSION_ZBS $(EXTENSION_ZBS) $(TOP);
SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP); SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP); SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP); SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP); SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP); SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP); SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
SYNTH_CMD += write_cxxrtl dut.cpp SYNTH_CMD += write_cxxrtl dut.cpp
dut.cpp: dut.cpp: