Add MUL_FASTER option to retire fast mul to stage 2 instead of stage 3
This commit is contained in:
		
							parent
							
								
									254350d300
								
							
						
					
					
						commit
						b7d9defcf2
					
				| 
						 | 
				
			
			@ -35,6 +35,9 @@ localparam XLEN = W_DATA;
 | 
			
		|||
generate if (MULH_FAST && !MUL_FAST)
 | 
			
		||||
	// $fatal's first argument is the finish_number (0, 1 or 2); the format
	// string and its arguments follow it. 1 matches the default verbosity.
	initial $fatal(1, "%m: MULH_FAST requires that MUL_FAST is also set.");
 | 
			
		||||
endgenerate
 | 
			
		||||
generate if (MUL_FASTER && !MUL_FAST)
 | 
			
		||||
	// $fatal's first argument is the finish_number (0, 1 or 2); the format
	// string and its arguments follow it. 1 matches the default verbosity.
	initial $fatal(1, "%m: MUL_FASTER requires that MUL_FAST is also set.");
 | 
			
		||||
endgenerate
 | 
			
		||||
//synthesis translate_on
 | 
			
		||||
 | 
			
		||||
// Latency of 1:
 | 
			
		||||
| 
						 | 
				
			
			@ -60,12 +63,19 @@ if (!MULH_FAST) begin: mul_only
 | 
			
		|||
reg [XLEN-1:0] op_a_r;
 | 
			
		||||
reg [XLEN-1:0] op_b_r;
 | 
			
		||||
 | 
			
		||||
if (MUL_FASTER) begin: op_passthrough
 | 
			
		||||
	always @ (*) begin
 | 
			
		||||
		op_a_r = op_a;
 | 
			
		||||
		op_b_r = op_b;
 | 
			
		||||
	end
 | 
			
		||||
end else begin: op_register
 | 
			
		||||
	always @ (posedge clk) begin
 | 
			
		||||
		if (op_vld) begin
 | 
			
		||||
			op_a_r <= op_a;
 | 
			
		||||
			op_b_r <= op_b;
 | 
			
		||||
		end
 | 
			
		||||
	end
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
// This should be inferred as 3 DSP tiles on UP5k:
 | 
			
		||||
//
 | 
			
		||||
| 
						 | 
				
			
			@ -97,6 +107,13 @@ reg [XLEN-1:0]    op_a_r;
 | 
			
		|||
reg [XLEN-1:0]    op_b_r;
 | 
			
		||||
reg [W_MULOP-1:0] op_r;
 | 
			
		||||
 | 
			
		||||
if (MUL_FASTER) begin: op_passthrough
 | 
			
		||||
	always @ (*) begin
 | 
			
		||||
		op_a_r = op_a;
 | 
			
		||||
		op_b_r = op_b;
 | 
			
		||||
		op_r = op;
 | 
			
		||||
	end
 | 
			
		||||
end else begin: op_register
 | 
			
		||||
	always @ (posedge clk) begin
 | 
			
		||||
		if (op_vld) begin
 | 
			
		||||
			op_a_r <= op_a;
 | 
			
		||||
| 
						 | 
				
			
			@ -104,6 +121,7 @@ always @ (posedge clk) begin
 | 
			
		|||
			op_r <= op;
 | 
			
		||||
		end
 | 
			
		||||
	end
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
 | 
			
		||||
wire op_b_signed = op_r == M_OP_MULH;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -151,12 +151,17 @@ parameter REDUCED_BYPASS      = 0,
 | 
			
		|||
parameter MULDIV_UNROLL       = 1,
 | 
			
		||||
 | 
			
		||||
// MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
 | 
			
		||||
// to stage M. The sequential multiply/divide circuit is still used for MULH*
 | 
			
		||||
// to stage 3. The sequential multiply/divide circuit is still used for MULH*
 | 
			
		||||
parameter MUL_FAST            = 0,
 | 
			
		||||
 | 
			
		||||
// MUL_FASTER: Retire fast multiply results to stage 2 instead of stage 3.
 | 
			
		||||
// Throughput is the same, but latency is reduced from 2 cycles to 1 cycle.
 | 
			
		||||
// Requires: MUL_FAST.
 | 
			
		||||
parameter MUL_FASTER          = 0,
 | 
			
		||||
 | 
			
		||||
// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
 | 
			
		||||
// the multiply functionality from the sequential multiply/divide circuit.
 | 
			
		||||
// Requires; MUL_FAST
 | 
			
		||||
// Requires: MUL_FAST
 | 
			
		||||
parameter MULH_FAST           = 0,
 | 
			
		||||
 | 
			
		||||
// FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -43,6 +43,7 @@
 | 
			
		|||
.REDUCED_BYPASS     (REDUCED_BYPASS),
 | 
			
		||||
.MULDIV_UNROLL      (MULDIV_UNROLL),
 | 
			
		||||
.MUL_FAST           (MUL_FAST),
 | 
			
		||||
.MUL_FASTER         (MUL_FASTER),
 | 
			
		||||
.MULH_FAST          (MULH_FAST),
 | 
			
		||||
.FAST_BRANCHCMP     (FAST_BRANCHCMP),
 | 
			
		||||
.BRANCH_PREDICTOR   (BRANCH_PREDICTOR),
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -613,6 +613,13 @@ end
 | 
			
		|||
wire [W_DATA-1:0] x_muldiv_result;
 | 
			
		||||
wire [W_DATA-1:0] m_fast_mul_result;
 | 
			
		||||
 | 
			
		||||
wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
 | 
			
		||||
	MUL_FAST  && d_mulop == M_OP_MUL   ||
 | 
			
		||||
	MULH_FAST && d_mulop == M_OP_MULH  ||
 | 
			
		||||
	MULH_FAST && d_mulop == M_OP_MULHU ||
 | 
			
		||||
	MULH_FAST && d_mulop == M_OP_MULHSU
 | 
			
		||||
);
 | 
			
		||||
 | 
			
		||||
generate
 | 
			
		||||
if (EXTENSION_M) begin: has_muldiv
 | 
			
		||||
	wire              x_muldiv_op_vld;
 | 
			
		||||
| 
						 | 
				
			
			@ -630,13 +637,6 @@ if (EXTENSION_M) begin: has_muldiv
 | 
			
		|||
 | 
			
		||||
	wire x_muldiv_kill = m_trap_enter_soon;
 | 
			
		||||
 | 
			
		||||
	wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
 | 
			
		||||
		MUL_FAST  && d_mulop == M_OP_MUL   ||
 | 
			
		||||
		MULH_FAST && d_mulop == M_OP_MULH  ||
 | 
			
		||||
		MULH_FAST && d_mulop == M_OP_MULHU ||
 | 
			
		||||
		MULH_FAST && d_mulop == M_OP_MULHSU
 | 
			
		||||
	);
 | 
			
		||||
 | 
			
		||||
	assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
 | 
			
		||||
		&& !(x_muldiv_posted || x_stall_on_raw || x_muldiv_kill);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -668,7 +668,9 @@ if (EXTENSION_M) begin: has_muldiv
 | 
			
		|||
 | 
			
		||||
	if (MUL_FAST) begin: has_fast_mul
 | 
			
		||||
 | 
			
		||||
		wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
 | 
			
		||||
		// If MUL_FASTER is set, the multiplier produces its result
 | 
			
		||||
		// combinatorially, so we never have to post a mul result to stage 3.
 | 
			
		||||
		wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall && !MUL_FASTER;
 | 
			
		||||
 | 
			
		||||
		hazard3_mul_fast #(
 | 
			
		||||
		`include "hazard3_config_inst.vh"
 | 
			
		||||
| 
						 | 
				
			
			@ -995,6 +997,7 @@ always @ (posedge clk or negedge rst_n) begin
 | 
			
		|||
		xm_result <=
 | 
			
		||||
			d_csr_ren                               ? x_csr_rdata       :
 | 
			
		||||
			|EXTENSION_A && x_amo_phase == 3'h3     ? mw_result         :
 | 
			
		||||
			|MUL_FASTER  && x_use_fast_mul          ? m_fast_mul_result :
 | 
			
		||||
			|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result   :
 | 
			
		||||
			                                          x_alu_result;
 | 
			
		||||
		xm_addr_align <= x_addr_sum[1:0];
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -16,6 +16,7 @@ PMP_REGIONS      := 4
 | 
			
		|||
 | 
			
		||||
MULDIV_UNROLL    := 2
 | 
			
		||||
MUL_FAST         := 1
 | 
			
		||||
MUL_FASTER       := 1
 | 
			
		||||
MULH_FAST        := 1
 | 
			
		||||
FAST_BRANCHCMP   := 1
 | 
			
		||||
REDUCED_BYPASS   := 0
 | 
			
		||||
| 
						 | 
				
			
			@ -43,6 +44,7 @@ SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
 | 
			
		|||
SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set MUL_FASTER $(MUL_FASTER) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
 | 
			
		||||
SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue