Add MUL_FASTER option to retire fast mul to stage 2 instead of stage 3

2022-07-05 03:37:05 +01:00 · 2022-07-05 03:37:05 +01:00 · b7d9defcf2
parent 254350d300
commit b7d9defcf2
5 changed files with 51 additions and 22 deletions
--- a/hdl/arith/hazard3_mul_fast.v
+++ b/hdl/arith/hazard3_mul_fast.v
@ -35,6 +35,9 @@ localparam XLEN = W_DATA;
 generate if (MULH_FAST && !MUL_FAST)
 	initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
 endgenerate
+generate if (MUL_FASTER && !MUL_FAST)
+	initial $fatal("%m: MUL_FASTER requires that MUL_FAST is also set.");
+endgenerate
 //synthesis translate_on

 // Latency of 1:
@ -60,10 +63,17 @@ if (!MULH_FAST) begin: mul_only
 reg [XLEN-1:0] op_a_r;
 reg [XLEN-1:0] op_b_r;

-always @ (posedge clk) begin
-	if (op_vld) begin
-		op_a_r <= op_a;
-		op_b_r <= op_b;
+if (MUL_FASTER) begin: op_passthrough
+	always @ (*) begin
+		op_a_r = op_a;
+		op_b_r = op_b;
+	end
+end else begin: op_register
+	always @ (posedge clk) begin
+		if (op_vld) begin
+			op_a_r <= op_a;
+			op_b_r <= op_b;
+		end
 	end
 end

@ -97,11 +107,19 @@ reg [XLEN-1:0]    op_a_r;
 reg [XLEN-1:0]    op_b_r;
 reg [W_MULOP-1:0] op_r;

-always @ (posedge clk) begin
-	if (op_vld) begin
-		op_a_r <= op_a;
-		op_b_r <= op_b;
-		op_r <= op;
+if (MUL_FASTER) begin: op_passthrough
+	always @ (*) begin
+		op_a_r = op_a;
+		op_b_r = op_b;
+		op_r = op;
+	end
+end else begin: op_register
+	always @ (posedge clk) begin
+		if (op_vld) begin
+			op_a_r <= op_a;
+			op_b_r <= op_b;
+			op_r <= op;
+		end
 	end
 end

--- a/hdl/hazard3_config.vh
+++ b/hdl/hazard3_config.vh
@ -151,12 +151,17 @@ parameter REDUCED_BYPASS      = 0,
 parameter MULDIV_UNROLL       = 1,

 // MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
-// to stage M. The sequential multiply/divide circuit is still used for MULH*
+// to stage 3. The sequential multiply/divide circuit is still used for MULH*
 parameter MUL_FAST            = 0,

+// MUL_FASTER: Retire fast multiply results to stage 2 instead of stage 3.
+// Throughput is the same, but latency is reduced from 2 cycles to 1 cycle.
+// Requires: MUL_FAST.
+parameter MUL_FASTER          = 0,
+
 // MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
 // the multiply functionality from the sequential multiply/divide circuit.
-// Requires; MUL_FAST
+// Requires: MUL_FAST
 parameter MULH_FAST           = 0,

 // FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
--- a/hdl/hazard3_config_inst.vh
+++ b/hdl/hazard3_config_inst.vh
@ -43,6 +43,7 @@
 .REDUCED_BYPASS     (REDUCED_BYPASS),
 .MULDIV_UNROLL      (MULDIV_UNROLL),
 .MUL_FAST           (MUL_FAST),
+.MUL_FASTER         (MUL_FASTER),
 .MULH_FAST          (MULH_FAST),
 .FAST_BRANCHCMP     (FAST_BRANCHCMP),
 .BRANCH_PREDICTOR   (BRANCH_PREDICTOR),
--- a/hdl/hazard3_core.v
+++ b/hdl/hazard3_core.v
@ -613,6 +613,13 @@ end
 wire [W_DATA-1:0] x_muldiv_result;
 wire [W_DATA-1:0] m_fast_mul_result;

+wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
+	MUL_FAST  && d_mulop == M_OP_MUL   ||
+	MULH_FAST && d_mulop == M_OP_MULH  ||
+	MULH_FAST && d_mulop == M_OP_MULHU ||
+	MULH_FAST && d_mulop == M_OP_MULHSU
+);
+
 generate
 if (EXTENSION_M) begin: has_muldiv
 	wire              x_muldiv_op_vld;
@ -630,13 +637,6 @@ if (EXTENSION_M) begin: has_muldiv

 	wire x_muldiv_kill = m_trap_enter_soon;

-	wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
-		MUL_FAST  && d_mulop == M_OP_MUL   ||
-		MULH_FAST && d_mulop == M_OP_MULH  ||
-		MULH_FAST && d_mulop == M_OP_MULHU ||
-		MULH_FAST && d_mulop == M_OP_MULHSU
-	);
-
 	assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
 		&& !(x_muldiv_posted || x_stall_on_raw || x_muldiv_kill);

@ -668,7 +668,9 @@ if (EXTENSION_M) begin: has_muldiv

 	if (MUL_FAST) begin: has_fast_mul

-		wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
+		// If MUL_FASTER is set, the multiplier result is combinatorial and is captured
+		// into xm_result during stage 2, so we never post a mul result to stage 3.
+		wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall && !MUL_FASTER;

 		hazard3_mul_fast #(
 		`include "hazard3_config_inst.vh"
@ -993,9 +995,10 @@ always @ (posedge clk or negedge rst_n) begin
 		// - Steer captured read phase data in mw_result back through xm_result at end of AMO
 		// - Make sure xm_result (store data) doesn't transition during stalled write dphase
 		xm_result <=
-			d_csr_ren                               ? x_csr_rdata :
-			|EXTENSION_A && x_amo_phase == 3'h3     ? mw_result :
-			|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result :
+			d_csr_ren                               ? x_csr_rdata       :
+			|EXTENSION_A && x_amo_phase == 3'h3     ? mw_result         :
+			|MUL_FASTER  && x_use_fast_mul          ? m_fast_mul_result :
+			|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result   :
 			                                          x_alu_result;
 		xm_addr_align <= x_addr_sum[1:0];
 	end
--- a/test/sim/tb_cxxrtl/Makefile
+++ b/test/sim/tb_cxxrtl/Makefile
@ -16,6 +16,7 @@ PMP_REGIONS      := 4

 MULDIV_UNROLL    := 2
 MUL_FAST         := 1
+MUL_FASTER       := 1
 MULH_FAST        := 1
 FAST_BRANCHCMP   := 1
 REDUCED_BYPASS   := 0
@ -43,6 +44,7 @@ SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
 SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
 SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
 SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
+SYNTH_CMD += chparam -set MUL_FASTER $(MUL_FASTER) $(TOP);
 SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
 SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
 SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);