Add MUL_FASTER option to retire fast mul to stage 2 instead of stage 3
This commit is contained in:
parent
254350d300
commit
b7d9defcf2
|
@ -35,6 +35,9 @@ localparam XLEN = W_DATA;
|
||||||
generate if (MULH_FAST && !MUL_FAST)
|
generate if (MULH_FAST && !MUL_FAST)
|
||||||
initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
|
initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
|
||||||
endgenerate
|
endgenerate
|
||||||
|
generate if (MUL_FASTER && !MUL_FAST)
|
||||||
|
initial $fatal("%m: MUL_FASTER requires that MUL_FAST is also set.");
|
||||||
|
endgenerate
|
||||||
//synthesis translate_on
|
//synthesis translate_on
|
||||||
|
|
||||||
// Latency of 1:
|
// Latency of 1:
|
||||||
|
@ -60,11 +63,18 @@ if (!MULH_FAST) begin: mul_only
|
||||||
reg [XLEN-1:0] op_a_r;
|
reg [XLEN-1:0] op_a_r;
|
||||||
reg [XLEN-1:0] op_b_r;
|
reg [XLEN-1:0] op_b_r;
|
||||||
|
|
||||||
always @ (posedge clk) begin
|
if (MUL_FASTER) begin: op_passthrough
|
||||||
|
always @ (*) begin
|
||||||
|
op_a_r = op_a;
|
||||||
|
op_b_r = op_b;
|
||||||
|
end
|
||||||
|
end else begin: op_register
|
||||||
|
always @ (posedge clk) begin
|
||||||
if (op_vld) begin
|
if (op_vld) begin
|
||||||
op_a_r <= op_a;
|
op_a_r <= op_a;
|
||||||
op_b_r <= op_b;
|
op_b_r <= op_b;
|
||||||
end
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
// This should be inferred as 3 DSP tiles on UP5k:
|
// This should be inferred as 3 DSP tiles on UP5k:
|
||||||
|
@ -97,12 +107,20 @@ reg [XLEN-1:0] op_a_r;
|
||||||
reg [XLEN-1:0] op_b_r;
|
reg [XLEN-1:0] op_b_r;
|
||||||
reg [W_MULOP-1:0] op_r;
|
reg [W_MULOP-1:0] op_r;
|
||||||
|
|
||||||
always @ (posedge clk) begin
|
if (MUL_FASTER) begin: op_passthrough
|
||||||
|
always @ (*) begin
|
||||||
|
op_a_r = op_a;
|
||||||
|
op_b_r = op_b;
|
||||||
|
op_r = op;
|
||||||
|
end
|
||||||
|
end else begin: op_register
|
||||||
|
always @ (posedge clk) begin
|
||||||
if (op_vld) begin
|
if (op_vld) begin
|
||||||
op_a_r <= op_a;
|
op_a_r <= op_a;
|
||||||
op_b_r <= op_b;
|
op_b_r <= op_b;
|
||||||
op_r <= op;
|
op_r <= op;
|
||||||
end
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
|
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
|
||||||
|
|
|
@ -151,12 +151,17 @@ parameter REDUCED_BYPASS = 0,
|
||||||
parameter MULDIV_UNROLL = 1,
|
parameter MULDIV_UNROLL = 1,
|
||||||
|
|
||||||
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
|
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
|
||||||
// to stage M. The sequential multiply/divide circuit is still used for MULH*
|
// to stage 3. The sequential multiply/divide circuit is still used for MULH*
|
||||||
parameter MUL_FAST = 0,
|
parameter MUL_FAST = 0,
|
||||||
|
|
||||||
|
// MUL_FASTER: Retire fast multiply results to stage 2 instead of stage 3.
|
||||||
|
// Throughput is the same, but latency is reduced from 2 cycles to 1 cycle.
|
||||||
|
// Requires: MUL_FAST.
|
||||||
|
parameter MUL_FASTER = 0,
|
||||||
|
|
||||||
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
|
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
|
||||||
// the multiply functionality from the sequential multiply/divide circuit.
|
// the multiply functionality from the sequential multiply/divide circuit.
|
||||||
// Requires; MUL_FAST
|
// Requires: MUL_FAST
|
||||||
parameter MULH_FAST = 0,
|
parameter MULH_FAST = 0,
|
||||||
|
|
||||||
// FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
|
// FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
|
||||||
|
|
|
@ -43,6 +43,7 @@
|
||||||
.REDUCED_BYPASS (REDUCED_BYPASS),
|
.REDUCED_BYPASS (REDUCED_BYPASS),
|
||||||
.MULDIV_UNROLL (MULDIV_UNROLL),
|
.MULDIV_UNROLL (MULDIV_UNROLL),
|
||||||
.MUL_FAST (MUL_FAST),
|
.MUL_FAST (MUL_FAST),
|
||||||
|
.MUL_FASTER (MUL_FASTER),
|
||||||
.MULH_FAST (MULH_FAST),
|
.MULH_FAST (MULH_FAST),
|
||||||
.FAST_BRANCHCMP (FAST_BRANCHCMP),
|
.FAST_BRANCHCMP (FAST_BRANCHCMP),
|
||||||
.BRANCH_PREDICTOR (BRANCH_PREDICTOR),
|
.BRANCH_PREDICTOR (BRANCH_PREDICTOR),
|
||||||
|
|
|
@ -613,6 +613,13 @@ end
|
||||||
wire [W_DATA-1:0] x_muldiv_result;
|
wire [W_DATA-1:0] x_muldiv_result;
|
||||||
wire [W_DATA-1:0] m_fast_mul_result;
|
wire [W_DATA-1:0] m_fast_mul_result;
|
||||||
|
|
||||||
|
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
|
||||||
|
MUL_FAST && d_mulop == M_OP_MUL ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULH ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULHU ||
|
||||||
|
MULH_FAST && d_mulop == M_OP_MULHSU
|
||||||
|
);
|
||||||
|
|
||||||
generate
|
generate
|
||||||
if (EXTENSION_M) begin: has_muldiv
|
if (EXTENSION_M) begin: has_muldiv
|
||||||
wire x_muldiv_op_vld;
|
wire x_muldiv_op_vld;
|
||||||
|
@ -630,13 +637,6 @@ if (EXTENSION_M) begin: has_muldiv
|
||||||
|
|
||||||
wire x_muldiv_kill = m_trap_enter_soon;
|
wire x_muldiv_kill = m_trap_enter_soon;
|
||||||
|
|
||||||
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
|
|
||||||
MUL_FAST && d_mulop == M_OP_MUL ||
|
|
||||||
MULH_FAST && d_mulop == M_OP_MULH ||
|
|
||||||
MULH_FAST && d_mulop == M_OP_MULHU ||
|
|
||||||
MULH_FAST && d_mulop == M_OP_MULHSU
|
|
||||||
);
|
|
||||||
|
|
||||||
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
|
assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
|
||||||
&& !(x_muldiv_posted || x_stall_on_raw || x_muldiv_kill);
|
&& !(x_muldiv_posted || x_stall_on_raw || x_muldiv_kill);
|
||||||
|
|
||||||
|
@ -668,7 +668,9 @@ if (EXTENSION_M) begin: has_muldiv
|
||||||
|
|
||||||
if (MUL_FAST) begin: has_fast_mul
|
if (MUL_FAST) begin: has_fast_mul
|
||||||
|
|
||||||
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
|
// If MUL_FASTER is set, the multiplier produces its result
|
||||||
|
// combinatorially, so we never have to post a mul result to stage 3.
|
||||||
|
wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall && !MUL_FASTER;
|
||||||
|
|
||||||
hazard3_mul_fast #(
|
hazard3_mul_fast #(
|
||||||
`include "hazard3_config_inst.vh"
|
`include "hazard3_config_inst.vh"
|
||||||
|
@ -995,6 +997,7 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
xm_result <=
|
xm_result <=
|
||||||
d_csr_ren ? x_csr_rdata :
|
d_csr_ren ? x_csr_rdata :
|
||||||
|EXTENSION_A && x_amo_phase == 3'h3 ? mw_result :
|
|EXTENSION_A && x_amo_phase == 3'h3 ? mw_result :
|
||||||
|
|MUL_FASTER && x_use_fast_mul ? m_fast_mul_result :
|
||||||
|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result :
|
|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result :
|
||||||
x_alu_result;
|
x_alu_result;
|
||||||
xm_addr_align <= x_addr_sum[1:0];
|
xm_addr_align <= x_addr_sum[1:0];
|
||||||
|
|
|
@ -16,6 +16,7 @@ PMP_REGIONS := 4
|
||||||
|
|
||||||
MULDIV_UNROLL := 2
|
MULDIV_UNROLL := 2
|
||||||
MUL_FAST := 1
|
MUL_FAST := 1
|
||||||
|
MUL_FASTER := 1
|
||||||
MULH_FAST := 1
|
MULH_FAST := 1
|
||||||
FAST_BRANCHCMP := 1
|
FAST_BRANCHCMP := 1
|
||||||
REDUCED_BYPASS := 0
|
REDUCED_BYPASS := 0
|
||||||
|
@ -43,6 +44,7 @@ SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
|
||||||
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
|
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
|
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
|
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
|
||||||
|
SYNTH_CMD += chparam -set MUL_FASTER $(MUL_FASTER) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
|
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
|
||||||
SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
|
SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
|
||||||
SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);
|
SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);
|
||||||
|
|
Loading…
Reference in New Issue