Add option for fast high-half multiplies
This commit is contained in:
parent
35c5e213c7
commit
c8afb4ac33
|
@ -15,21 +15,55 @@
|
||||||
* *
|
* *
|
||||||
*********************************************************************/
|
*********************************************************************/
|
||||||
|
|
||||||
|
// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
|
||||||
|
// MULH_FAST) are handled by different circuits. In either case it's a simple
|
||||||
|
// behavioural multiply, and we rely on inference to get good performance on
|
||||||
|
// FPGA.
|
||||||
|
|
||||||
`default_nettype none
|
`default_nettype none
|
||||||
|
|
||||||
module hazard3_mul_fast #(
|
module hazard3_mul_fast #(
|
||||||
parameter XLEN = 32
|
`include "hazard3_config.vh"
|
||||||
|
,
|
||||||
|
`include "hazard3_width_const.vh"
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire rst_n,
|
input wire rst_n,
|
||||||
input wire [XLEN-1:0] op_a,
|
|
||||||
input wire [XLEN-1:0] op_b,
|
|
||||||
input wire op_vld,
|
|
||||||
|
|
||||||
output wire [XLEN-1:0] result,
|
input wire [W_MULOP-1:0] op,
|
||||||
|
input wire op_vld,
|
||||||
|
input wire [W_DATA-1:0] op_a,
|
||||||
|
input wire [W_DATA-1:0] op_b,
|
||||||
|
|
||||||
|
output wire [W_DATA-1:0] result,
|
||||||
output reg result_vld
|
output reg result_vld
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`include "hazard3_ops.vh"
|
||||||
|
|
||||||
|
localparam XLEN = W_DATA;
|
||||||
|
|
||||||
|
//synthesis translate_off
|
||||||
|
generate if (MULH_FAST && !MUL_FAST)
|
||||||
|
initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
|
||||||
|
endgenerate
|
||||||
|
//synthesis translate_on
|
||||||
|
|
||||||
|
// Latency of 1:
|
||||||
|
always @ (posedge clk or negedge rst_n) begin
|
||||||
|
if (!rst_n) begin
|
||||||
|
result_vld <= 1'b0;
|
||||||
|
end else begin
|
||||||
|
result_vld <= op_vld;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
// Fast MUL only
|
||||||
|
|
||||||
|
generate
|
||||||
|
if (!MULH_FAST) begin: mul_only
|
||||||
|
|
||||||
// This pipestage is folded into the front of the DSP tiles on UP5k. Note the
|
// This pipestage is folded into the front of the DSP tiles on UP5k. Note the
|
||||||
// intention is to register the bypassed core regs at the end of X (since
|
// intention is to register the bypassed core regs at the end of X (since
|
||||||
// bypass is quite slow), then perform multiply combinatorially in stage M,
|
// bypass is quite slow), then perform multiply combinatorially in stage M,
|
||||||
|
@ -66,14 +100,55 @@ assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
always @ (posedge clk or negedge rst_n) begin
|
// ----------------------------------------------------------------------------
|
||||||
if (!rst_n) begin
|
// Fast MUL/MULH/MULHU/MULHSU
|
||||||
result_vld <= 1'b0;
|
|
||||||
end else begin
|
end else begin: mul_and_mulh
|
||||||
result_vld <= op_vld;
|
|
||||||
|
reg [XLEN-1:0] op_a_r;
|
||||||
|
reg [XLEN-1:0] op_b_r;
|
||||||
|
reg [W_MULOP-1:0] op_r;
|
||||||
|
|
||||||
|
always @ (posedge clk) begin
|
||||||
|
if (op_vld) begin
|
||||||
|
op_a_r <= op_a;
|
||||||
|
op_b_r <= op_b;
|
||||||
|
op_r <= op;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
|
||||||
|
wire op_b_signed = op_r == M_OP_MULH;
|
||||||
|
|
||||||
|
wire [2*XLEN-1:0] op_a_sext = {
|
||||||
|
{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
|
||||||
|
op_a_r
|
||||||
|
};
|
||||||
|
|
||||||
|
wire [2*XLEN-1:0] op_b_sext = {
|
||||||
|
{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
|
||||||
|
op_b_r
|
||||||
|
};
|
||||||
|
|
||||||
|
wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;
|
||||||
|
|
||||||
|
`ifndef RISCV_FORMAL_ALTOPS
|
||||||
|
|
||||||
|
assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];
|
||||||
|
|
||||||
|
`else
|
||||||
|
|
||||||
|
assign result =
|
||||||
|
op_r == M_OP_MULH ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
|
||||||
|
op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
|
||||||
|
op_r == M_OP_MULHU ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
|
||||||
|
op_r == M_OP_MUL ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
|
||||||
|
|
||||||
|
`endif
|
||||||
|
|
||||||
|
end
|
||||||
|
endgenerate
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|
||||||
`default_nettype wire
|
`default_nettype wire
|
||||||
|
|
|
@ -30,9 +30,8 @@
|
||||||
`default_nettype none
|
`default_nettype none
|
||||||
|
|
||||||
module hazard3_muldiv_seq #(
|
module hazard3_muldiv_seq #(
|
||||||
parameter XLEN = 32,
|
`include "hazard3_config.vh"
|
||||||
parameter UNROLL = 1,
|
,
|
||||||
parameter W_CTR = $clog2(XLEN + 1), // do not modify
|
|
||||||
`include "hazard3_width_const.vh"
|
`include "hazard3_width_const.vh"
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
|
@ -41,22 +40,25 @@ module hazard3_muldiv_seq #(
|
||||||
input wire op_vld,
|
input wire op_vld,
|
||||||
output wire op_rdy,
|
output wire op_rdy,
|
||||||
input wire op_kill,
|
input wire op_kill,
|
||||||
input wire [XLEN-1:0] op_a,
|
input wire [W_DATA-1:0] op_a,
|
||||||
input wire [XLEN-1:0] op_b,
|
input wire [W_DATA-1:0] op_b,
|
||||||
|
|
||||||
output wire [XLEN-1:0] result_h, // mulh* or rem*
|
output wire [W_DATA-1:0] result_h, // mulh* or rem*
|
||||||
output wire [XLEN-1:0] result_l, // mul or div*
|
output wire [W_DATA-1:0] result_l, // mul or div*
|
||||||
output wire result_vld
|
output wire result_vld
|
||||||
);
|
);
|
||||||
|
|
||||||
`include "hazard3_ops.vh"
|
`include "hazard3_ops.vh"
|
||||||
|
|
||||||
//synthesis translate_off
|
//synthesis translate_off
|
||||||
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
|
generate if (MULDIV_UNROLL & (MULDIV_UNROLL - 1) || ~|MULDIV_UNROLL)
|
||||||
initial $fatal("%m: UNROLL must be a positive power of 2");
|
initial $fatal("%m: MULDIV_UNROLL must be a positive power of 2");
|
||||||
endgenerate
|
endgenerate
|
||||||
//synthesis translate_on
|
//synthesis translate_on
|
||||||
|
|
||||||
|
localparam XLEN = W_DATA;
|
||||||
|
// Derived from XLEN, so declared as a localparam: it must not be overridden
// from outside. Used to size the step subtracted from ctr on each iteration.
localparam W_CTR = $clog2(XLEN + 1);
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
// Operation decode, operand sign adjustment
|
// Operation decode, operand sign adjustment
|
||||||
|
|
||||||
|
@ -85,7 +87,10 @@ wire op_b_signed =
|
||||||
wire op_a_neg = op_a_signed && accum[XLEN-1];
|
wire op_a_neg = op_a_signed && accum[XLEN-1];
|
||||||
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
|
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
|
||||||
|
|
||||||
wire is_div = op_r[2];
|
// Non-divide parts of the circuit should be constant-folded if all the MUL
|
||||||
|
// operations are handled by the fast multiplier
|
||||||
|
|
||||||
|
wire is_div = op_r[2] || (MUL_FAST && MULH_FAST);
|
||||||
|
|
||||||
// Controls for modifying sign of all/part of accumulator
|
// Controls for modifying sign of all/part of accumulator
|
||||||
wire accum_neg_l;
|
wire accum_neg_l;
|
||||||
|
@ -109,7 +114,7 @@ always @ (*) begin: alu
|
||||||
addend = {2*XLEN{1'b0}};
|
addend = {2*XLEN{1'b0}};
|
||||||
addsub_tmp = {2*XLEN{1'b0}};
|
addsub_tmp = {2*XLEN{1'b0}};
|
||||||
neg_l_borrow = 1'b0;
|
neg_l_borrow = 1'b0;
|
||||||
for (i = 0; i < UNROLL; i = i + 1) begin
|
for (i = 0; i < MULDIV_UNROLL; i = i + 1) begin
|
||||||
addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
|
addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
|
||||||
shift_tmp = is_div ? accum_next : accum_next >> 1;
|
shift_tmp = is_div ? accum_next : accum_next >> 1;
|
||||||
addsub_tmp = shift_tmp + addend;
|
addsub_tmp = shift_tmp + addend;
|
||||||
|
@ -167,11 +172,11 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
if (op_b_neg ^ is_div)
|
if (op_b_neg ^ is_div)
|
||||||
op_b_r <= -op_b_r;
|
op_b_r <= -op_b_r;
|
||||||
end else begin
|
end else begin
|
||||||
ctr <= ctr - UNROLL[W_CTR-1:0];
|
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
|
||||||
accum <= accum_next;
|
accum <= accum_next;
|
||||||
end
|
end
|
||||||
end else if (|ctr) begin
|
end else if (|ctr) begin
|
||||||
ctr <= ctr - UNROLL[W_CTR-1:0];
|
ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
|
||||||
accum <= accum_next;
|
accum <= accum_next;
|
||||||
end else if (!sign_postadj_done || sign_postadj_carry) begin
|
end else if (!sign_postadj_done || sign_postadj_carry) begin
|
||||||
sign_postadj_done <= 1'b1;
|
sign_postadj_done <= 1'b1;
|
||||||
|
@ -287,7 +292,7 @@ always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
|
||||||
// are forced in immediately, simultaneous with a kill, in which case there
|
// are forced in immediately, simultaneous with a kill, in which case there
|
||||||
// is no intermediate ready state.
|
// is no intermediate ready state.
|
||||||
alive = op_rdy || (op_kill && op_vld);
|
alive = op_rdy || (op_kill && op_vld);
|
||||||
for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
|
for (i = 1; i <= XLEN / MULDIV_UNROLL + 3; i = i + 1)
|
||||||
alive = alive || $past(op_rdy || (op_kill && op_vld), i);
|
alive = alive || $past(op_rdy || (op_kill && op_vld), i);
|
||||||
assert(alive);
|
assert(alive);
|
||||||
end
|
end
|
||||||
|
|
|
@ -94,10 +94,14 @@ parameter REDUCED_BYPASS = 0,
|
||||||
parameter MULDIV_UNROLL = 1,
|
parameter MULDIV_UNROLL = 1,
|
||||||
|
|
||||||
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
|
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
|
||||||
// to stage M. The sequential multiply/divide circuit is still used for
|
// to stage M. The sequential multiply/divide circuit is still used for MULH*, unless MULH_FAST is also set.
|
||||||
// MULH/MULHU/MULHSU.
|
|
||||||
parameter MUL_FAST = 0,
|
parameter MUL_FAST = 0,
|
||||||
|
|
||||||
|
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
|
||||||
|
// the multiply functionality from the sequential multiply/divide circuit.
|
||||||
|
// Requires: MUL_FAST
|
||||||
|
parameter MULH_FAST = 0,
|
||||||
|
|
||||||
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
|
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
|
||||||
// making trap vector base partly fixed (legal, as it's WARL).
|
// making trap vector base partly fixed (legal, as it's WARL).
|
||||||
//
|
//
|
||||||
|
@ -105,7 +109,7 @@ parameter MUL_FAST = 0,
|
||||||
//
|
//
|
||||||
// - Note the entire vector table must always be aligned to its size, rounded
|
// - Note the entire vector table must always be aligned to its size, rounded
|
||||||
// up to a power of two, so careful with the low-order bits.
|
// up to a power of two, so careful with the low-order bits.
|
||||||
parameter MTVEC_WMASK = 32'hffffffff,
|
parameter MTVEC_WMASK = 32'hfffffffd,
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
// Port size parameters (do not modify)
|
// Port size parameters (do not modify)
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
.REDUCED_BYPASS (REDUCED_BYPASS),
|
.REDUCED_BYPASS (REDUCED_BYPASS),
|
||||||
.MULDIV_UNROLL (MULDIV_UNROLL),
|
.MULDIV_UNROLL (MULDIV_UNROLL),
|
||||||
.MUL_FAST (MUL_FAST),
|
.MUL_FAST (MUL_FAST),
|
||||||
|
.MULH_FAST (MULH_FAST),
|
||||||
.MTVEC_WMASK (MTVEC_WMASK),
|
.MTVEC_WMASK (MTVEC_WMASK),
|
||||||
.W_ADDR (W_ADDR),
|
.W_ADDR (W_ADDR),
|
||||||
.W_DATA (W_DATA)
|
.W_DATA (W_DATA)
|
||||||
|
|
|
@ -406,14 +406,18 @@ if (EXTENSION_M) begin: has_muldiv
|
||||||
|
|
||||||
wire x_muldiv_kill = m_trap_enter_soon;
|
wire x_muldiv_kill = m_trap_enter_soon;
|
||||||
|
|
||||||
wire x_use_fast_mul = MUL_FAST && d_aluop == ALUOP_MULDIV && d_mulop == M_OP_MUL;
|
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
|
||||||
|
MUL_FAST && d_mulop == M_OP_MUL ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULH ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULHU ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULHSU
|
||||||
|
);
|
||||||
|
|
||||||
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
|
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
|
||||||
&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
|
&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
|
||||||
|
|
||||||
hazard3_muldiv_seq #(
|
hazard3_muldiv_seq #(
|
||||||
.XLEN (W_DATA),
|
`include "hazard3_config_inst.vh"
|
||||||
.UNROLL (MULDIV_UNROLL)
|
|
||||||
) muldiv (
|
) muldiv (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.rst_n (rst_n),
|
.rst_n (rst_n),
|
||||||
|
@ -444,14 +448,15 @@ if (EXTENSION_M) begin: has_muldiv
|
||||||
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
|
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
|
||||||
|
|
||||||
hazard3_mul_fast #(
|
hazard3_mul_fast #(
|
||||||
.XLEN(W_DATA)
|
`include "hazard3_config_inst.vh"
|
||||||
) inst_hazard3_mul_fast (
|
) mul_fast (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.rst_n (rst_n),
|
.rst_n (rst_n),
|
||||||
|
|
||||||
|
.op_vld (x_issue_fast_mul),
|
||||||
|
.op (d_mulop),
|
||||||
.op_a (x_rs1_bypass),
|
.op_a (x_rs1_bypass),
|
||||||
.op_b (x_rs2_bypass),
|
.op_b (x_rs2_bypass),
|
||||||
.op_vld (x_issue_fast_mul),
|
|
||||||
|
|
||||||
.result (m_fast_mul_result),
|
.result (m_fast_mul_result),
|
||||||
.result_vld (m_fast_mul_result_vld)
|
.result_vld (m_fast_mul_result_vld)
|
||||||
|
|
|
@ -2,11 +2,18 @@ TOP := hazard3_cpu_2port
|
||||||
CDEFINES := DUAL_PORT
|
CDEFINES := DUAL_PORT
|
||||||
|
|
||||||
CPU_RESET_VECTOR := 32'hc0
|
CPU_RESET_VECTOR := 32'hc0
|
||||||
|
|
||||||
EXTENSION_C := 1
|
EXTENSION_C := 1
|
||||||
EXTENSION_M := 1
|
EXTENSION_M := 1
|
||||||
|
EXTENSION_ZBA := 1
|
||||||
|
EXTENSION_ZBB := 1
|
||||||
|
EXTENSION_ZBC := 1
|
||||||
|
EXTENSION_ZBS := 1
|
||||||
DEBUG_SUPPORT := 0
|
DEBUG_SUPPORT := 0
|
||||||
|
|
||||||
MULDIV_UNROLL := 2
|
MULDIV_UNROLL := 2
|
||||||
MUL_FAST := 1
|
MUL_FAST := 1
|
||||||
|
MULH_FAST := 1
|
||||||
REDUCED_BYPASS := 0
|
REDUCED_BYPASS := 0
|
||||||
|
|
||||||
.PHONY: clean tb all
|
.PHONY: clean tb all
|
||||||
|
@ -16,12 +23,17 @@ all: tb
|
||||||
SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
|
SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
|
||||||
SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
|
SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
|
||||||
SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
|
SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set EXTENSION_ZBA $(EXTENSION_ZBA) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set EXTENSION_ZBB $(EXTENSION_ZBB) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set EXTENSION_ZBC $(EXTENSION_ZBC) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set EXTENSION_ZBS $(EXTENSION_ZBS) $(TOP);
|
||||||
SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
|
SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
|
||||||
SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
|
SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
|
||||||
SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
|
SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
|
||||||
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
|
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
|
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
|
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
|
||||||
SYNTH_CMD += write_cxxrtl dut.cpp
|
SYNTH_CMD += write_cxxrtl dut.cpp
|
||||||
|
|
||||||
dut.cpp:
|
dut.cpp:
|
||||||
|
|
Loading…
Reference in New Issue