Add option for fast high-half multiplies
This commit is contained in:
parent
35c5e213c7
commit
c8afb4ac33
|
@ -15,21 +15,55 @@
|
|||
* *
|
||||
*********************************************************************/
|
||||
|
||||
// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
|
||||
// MULH_FAST) are handled by different circuits. In either case it's a simple
|
||||
// behavioural multiply, and we rely on inference to get good performance on
|
||||
// FPGA.
|
||||
|
||||
`default_nettype none
|
||||
|
||||
module hazard3_mul_fast #(
|
||||
parameter XLEN = 32
|
||||
`include "hazard3_config.vh"
|
||||
,
|
||||
`include "hazard3_width_const.vh"
|
||||
) (
|
||||
input wire clk,
|
||||
input wire rst_n,
|
||||
input wire [XLEN-1:0] op_a,
|
||||
input wire [XLEN-1:0] op_b,
|
||||
input wire op_vld,
|
||||
input wire clk,
|
||||
input wire rst_n,
|
||||
|
||||
output wire [XLEN-1:0] result,
|
||||
input wire [W_MULOP-1:0] op,
|
||||
input wire op_vld,
|
||||
input wire [W_DATA-1:0] op_a,
|
||||
input wire [W_DATA-1:0] op_b,
|
||||
|
||||
output wire [W_DATA-1:0] result,
|
||||
output reg result_vld
|
||||
);
|
||||
|
||||
`include "hazard3_ops.vh"
|
||||
|
||||
localparam XLEN = W_DATA;
|
||||
|
||||
//synthesis translate_off
|
||||
generate if (MULH_FAST && !MUL_FAST)
|
||||
initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
|
||||
endgenerate
|
||||
//synthesis translate_on
|
||||
|
||||
// Latency of 1:
|
||||
always @ (posedge clk or negedge rst_n) begin
|
||||
if (!rst_n) begin
|
||||
result_vld <= 1'b0;
|
||||
end else begin
|
||||
result_vld <= op_vld;
|
||||
end
|
||||
end
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Fast MUL only
|
||||
|
||||
generate
|
||||
if (!MULH_FAST) begin: mul_only
|
||||
|
||||
// This pipestage is folded into the front of the DSP tiles on UP5k. Note the
|
||||
// intention is to register the bypassed core regs at the end of X (since
|
||||
// bypass is quite slow), then perform multiply combinatorially in stage M,
|
||||
|
@ -66,14 +100,55 @@ assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
|
|||
|
||||
`endif
|
||||
|
||||
always @ (posedge clk or negedge rst_n) begin
|
||||
if (!rst_n) begin
|
||||
result_vld <= 1'b0;
|
||||
end else begin
|
||||
result_vld <= op_vld;
|
||||
// ----------------------------------------------------------------------------
|
||||
// Fast MUL/MULH/MULHU/MULHSU
|
||||
|
||||
end else begin: mul_and_mulh
|
||||
|
||||
reg [XLEN-1:0] op_a_r;
|
||||
reg [XLEN-1:0] op_b_r;
|
||||
reg [W_MULOP-1:0] op_r;
|
||||
|
||||
always @ (posedge clk) begin
|
||||
if (op_vld) begin
|
||||
op_a_r <= op_a;
|
||||
op_b_r <= op_b;
|
||||
op_r <= op;
|
||||
end
|
||||
end
|
||||
|
||||
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
|
||||
wire op_b_signed = op_r == M_OP_MULH;
|
||||
|
||||
wire [2*XLEN-1:0] op_a_sext = {
|
||||
{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
|
||||
op_a_r
|
||||
};
|
||||
|
||||
wire [2*XLEN-1:0] op_b_sext = {
|
||||
{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
|
||||
op_b_r
|
||||
};
|
||||
|
||||
wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;
|
||||
|
||||
`ifndef RISCV_FORMAL_ALTOPS
|
||||
|
||||
assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];
|
||||
|
||||
`else
|
||||
|
||||
assign result =
|
||||
op_r == M_OP_MULH ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
|
||||
op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
|
||||
op_r == M_OP_MULHU ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
|
||||
op_r == M_OP_MUL ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
|
||||
|
||||
`endif
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
endmodule
|
||||
|
||||
`default_nettype wire
|
||||
|
|
|
@ -30,9 +30,8 @@
|
|||
`default_nettype none
|
||||
|
||||
module hazard3_muldiv_seq #(
|
||||
parameter XLEN = 32,
|
||||
parameter UNROLL = 1,
|
||||
parameter W_CTR = $clog2(XLEN + 1), // do not modify
|
||||
`include "hazard3_config.vh"
|
||||
,
|
||||
`include "hazard3_width_const.vh"
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -41,22 +40,25 @@ module hazard3_muldiv_seq #(
|
|||
input wire op_vld,
|
||||
output wire op_rdy,
|
||||
input wire op_kill,
|
||||
input wire [XLEN-1:0] op_a,
|
||||
input wire [XLEN-1:0] op_b,
|
||||
input wire [W_DATA-1:0] op_a,
|
||||
input wire [W_DATA-1:0] op_b,
|
||||
|
||||
output wire [XLEN-1:0] result_h, // mulh* or rem*
|
||||
output wire [XLEN-1:0] result_l, // mul or div*
|
||||
output wire [W_DATA-1:0] result_h, // mulh* or rem*
|
||||
output wire [W_DATA-1:0] result_l, // mul or div*
|
||||
output wire result_vld
|
||||
);
|
||||
|
||||
`include "hazard3_ops.vh"
|
||||
|
||||
//synthesis translate_off
|
||||
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
|
||||
initial $fatal("%m: UNROLL must be a positive power of 2");
|
||||
generate if (MULDIV_UNROLL & (MULDIV_UNROLL - 1) || ~|MULDIV_UNROLL)
|
||||
initial $fatal("%m: MULDIV_UNROLL must be a positive power of 2");
|
||||
endgenerate
|
||||
//synthesis translate_on
|
||||
|
||||
localparam XLEN = W_DATA;
|
||||
parameter W_CTR = $clog2(XLEN + 1);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Operation decode, operand sign adjustment
|
||||
|
||||
|
@ -85,7 +87,10 @@ wire op_b_signed =
|
|||
wire op_a_neg = op_a_signed && accum[XLEN-1];
|
||||
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
|
||||
|
||||
wire is_div = op_r[2];
|
||||
// Non-divide parts of the circuit should be constant-folded if all the MUL
|
||||
// operations are handled by the fast multiplier (MUL_FAST && MULH_FAST).
|
||||
|
||||
wire is_div = op_r[2] || (MUL_FAST && MULH_FAST);
|
||||
|
||||
// Controls for modifying sign of all/part of accumulator
|
||||
wire accum_neg_l;
|
||||
|
@ -109,7 +114,7 @@ always @ (*) begin: alu
|
|||
addend = {2*XLEN{1'b0}};
|
||||
addsub_tmp = {2*XLEN{1'b0}};
|
||||
neg_l_borrow = 1'b0;
|
||||
for (i = 0; i < UNROLL; i = i + 1) begin
|
||||
for (i = 0; i < MULDIV_UNROLL; i = i + 1) begin
|
||||
addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
|
||||
shift_tmp = is_div ? accum_next : accum_next >> 1;
|
||||
addsub_tmp = shift_tmp + addend;
|
||||
|
@ -167,11 +172,11 @@ always @ (posedge clk or negedge rst_n) begin
|
|||
if (op_b_neg ^ is_div)
|
||||
op_b_r <= -op_b_r;
|
||||
end else begin
|
||||
ctr <= ctr - UNROLL[W_CTR-1:0];
|
||||
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
|
||||
accum <= accum_next;
|
||||
end
|
||||
end else if (|ctr) begin
|
||||
ctr <= ctr - UNROLL[W_CTR-1:0];
|
||||
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
|
||||
accum <= accum_next;
|
||||
end else if (!sign_postadj_done || sign_postadj_carry) begin
|
||||
sign_postadj_done <= 1'b1;
|
||||
|
@ -287,7 +292,7 @@ always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
|
|||
// are forced in immediately, simultaneous with a kill, in which case there
|
||||
// is no intermediate ready state.
|
||||
alive = op_rdy || (op_kill && op_vld);
|
||||
for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
|
||||
for (i = 1; i <= XLEN / MULDIV_UNROLL + 3; i = i + 1)
|
||||
alive = alive || $past(op_rdy || (op_kill && op_vld), i);
|
||||
assert(alive);
|
||||
end
|
||||
|
|
|
@ -94,10 +94,14 @@ parameter REDUCED_BYPASS = 0,
|
|||
parameter MULDIV_UNROLL = 1,
|
||||
|
||||
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
|
||||
// to stage M. The sequential multiply/divide circuit is still used for
|
||||
// MULH/MULHU/MULHSU.
|
||||
// to stage M. The sequential multiply/divide circuit is still used for MULH*
// (unless MULH_FAST is also set).
|
||||
parameter MUL_FAST = 0,
|
||||
|
||||
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
|
||||
// the multiply functionality from the sequential multiply/divide circuit.
|
||||
// Requires: MUL_FAST
|
||||
parameter MULH_FAST = 0,
|
||||
|
||||
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
|
||||
// making trap vector base partly fixed (legal, as it's WARL).
|
||||
//
|
||||
|
@ -105,7 +109,7 @@ parameter MUL_FAST = 0,
|
|||
//
|
||||
// - Note the entire vector table must always be aligned to its size, rounded
|
||||
// up to a power of two, so be careful with the low-order bits.
|
||||
parameter MTVEC_WMASK = 32'hffffffff,
|
||||
parameter MTVEC_WMASK = 32'hfffffffd,
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Port size parameters (do not modify)
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
.REDUCED_BYPASS (REDUCED_BYPASS),
|
||||
.MULDIV_UNROLL (MULDIV_UNROLL),
|
||||
.MUL_FAST (MUL_FAST),
|
||||
.MULH_FAST (MULH_FAST),
|
||||
.MTVEC_WMASK (MTVEC_WMASK),
|
||||
.W_ADDR (W_ADDR),
|
||||
.W_DATA (W_DATA)
|
||||
|
|
|
@ -406,14 +406,18 @@ if (EXTENSION_M) begin: has_muldiv
|
|||
|
||||
wire x_muldiv_kill = m_trap_enter_soon;
|
||||
|
||||
wire x_use_fast_mul = MUL_FAST && d_aluop == ALUOP_MULDIV && d_mulop == M_OP_MUL;
|
||||
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
|
||||
MUL_FAST && d_mulop == M_OP_MUL ||
|
||||
MULH_FAST && d_mulop == M_OP_MULH ||
|
||||
MULH_FAST && d_mulop == M_OP_MULHU ||
|
||||
MULH_FAST && d_mulop == M_OP_MULHSU
|
||||
);
|
||||
|
||||
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
|
||||
&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
|
||||
|
||||
hazard3_muldiv_seq #(
|
||||
.XLEN (W_DATA),
|
||||
.UNROLL (MULDIV_UNROLL)
|
||||
`include "hazard3_config_inst.vh"
|
||||
) muldiv (
|
||||
.clk (clk),
|
||||
.rst_n (rst_n),
|
||||
|
@ -444,14 +448,15 @@ if (EXTENSION_M) begin: has_muldiv
|
|||
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
|
||||
|
||||
hazard3_mul_fast #(
|
||||
.XLEN(W_DATA)
|
||||
) inst_hazard3_mul_fast (
|
||||
`include "hazard3_config_inst.vh"
|
||||
) mul_fast (
|
||||
.clk (clk),
|
||||
.rst_n (rst_n),
|
||||
|
||||
.op_vld (x_issue_fast_mul),
|
||||
.op (d_mulop),
|
||||
.op_a (x_rs1_bypass),
|
||||
.op_b (x_rs2_bypass),
|
||||
.op_vld (x_issue_fast_mul),
|
||||
|
||||
.result (m_fast_mul_result),
|
||||
.result_vld (m_fast_mul_result_vld)
|
||||
|
|
|
@ -2,11 +2,18 @@ TOP := hazard3_cpu_2port
|
|||
CDEFINES := DUAL_PORT
|
||||
|
||||
CPU_RESET_VECTOR := 32'hc0
|
||||
|
||||
EXTENSION_C := 1
|
||||
EXTENSION_M := 1
|
||||
EXTENSION_ZBA := 1
|
||||
EXTENSION_ZBB := 1
|
||||
EXTENSION_ZBC := 1
|
||||
EXTENSION_ZBS := 1
|
||||
DEBUG_SUPPORT := 0
|
||||
|
||||
MULDIV_UNROLL := 2
|
||||
MUL_FAST := 1
|
||||
MULH_FAST := 1
|
||||
REDUCED_BYPASS := 0
|
||||
|
||||
.PHONY: clean tb all
|
||||
|
@ -16,12 +23,17 @@ all: tb
|
|||
SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
|
||||
SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
|
||||
SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
|
||||
SYNTH_CMD += chparam -set EXTENSION_ZBA $(EXTENSION_ZBA) $(TOP);
|
||||
SYNTH_CMD += chparam -set EXTENSION_ZBB $(EXTENSION_ZBB) $(TOP);
|
||||
SYNTH_CMD += chparam -set EXTENSION_ZBC $(EXTENSION_ZBC) $(TOP);
|
||||
SYNTH_CMD += chparam -set EXTENSION_ZBS $(EXTENSION_ZBS) $(TOP);
|
||||
SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
|
||||
SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
|
||||
SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
|
||||
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
|
||||
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
|
||||
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
|
||||
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
|
||||
SYNTH_CMD += write_cxxrtl dut.cpp
|
||||
|
||||
dut.cpp:
|
||||
|
|
Loading…
Reference in New Issue