From b7d9defcf2ff831835a62f574e63993111159d73 Mon Sep 17 00:00:00 2001 From: Luke Wren Date: Tue, 5 Jul 2022 03:37:05 +0100 Subject: [PATCH] Add MUL_FASTER option to retire fast mul to stage 2 instead of stage 3 --- hdl/arith/hazard3_mul_fast.v | 36 +++++++++++++++++++++++++++--------- hdl/hazard3_config.vh | 9 +++++++-- hdl/hazard3_config_inst.vh | 1 + hdl/hazard3_core.v | 25 ++++++++++++++----------- test/sim/tb_cxxrtl/Makefile | 2 ++ 5 files changed, 51 insertions(+), 22 deletions(-) diff --git a/hdl/arith/hazard3_mul_fast.v b/hdl/arith/hazard3_mul_fast.v index 78ba272..b066165 100644 --- a/hdl/arith/hazard3_mul_fast.v +++ b/hdl/arith/hazard3_mul_fast.v @@ -35,6 +35,9 @@ localparam XLEN = W_DATA; generate if (MULH_FAST && !MUL_FAST) initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set."); endgenerate +generate if (MUL_FASTER && !MUL_FAST) + initial $fatal("%m: MUL_FASTER requires that MUL_FAST is also set."); +endgenerate //synthesis translate_on // Latency of 1: @@ -60,10 +63,17 @@ if (!MULH_FAST) begin: mul_only reg [XLEN-1:0] op_a_r; reg [XLEN-1:0] op_b_r; -always @ (posedge clk) begin - if (op_vld) begin - op_a_r <= op_a; - op_b_r <= op_b; +if (MUL_FASTER) begin: op_passthrough + always @ (*) begin + op_a_r = op_a; + op_b_r = op_b; + end +end else begin: op_register + always @ (posedge clk) begin + if (op_vld) begin + op_a_r <= op_a; + op_b_r <= op_b; + end end end @@ -97,11 +107,19 @@ reg [XLEN-1:0] op_a_r; reg [XLEN-1:0] op_b_r; reg [W_MULOP-1:0] op_r; -always @ (posedge clk) begin - if (op_vld) begin - op_a_r <= op_a; - op_b_r <= op_b; - op_r <= op; +if (MUL_FASTER) begin: op_passthrough + always @ (*) begin + op_a_r = op_a; + op_b_r = op_b; + op_r = op; + end +end else begin: op_register + always @ (posedge clk) begin + if (op_vld) begin + op_a_r <= op_a; + op_b_r <= op_b; + op_r <= op; + end end end diff --git a/hdl/hazard3_config.vh b/hdl/hazard3_config.vh index b7961db..4be454c 100644 --- a/hdl/hazard3_config.vh +++ b/hdl/hazard3_config.vh @@ -151,12 +151,17 @@ parameter REDUCED_BYPASS = 0, parameter MULDIV_UNROLL = 1, // MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring -// to stage M. The sequential multiply/divide circuit is still used for MULH* +// to stage 3. The sequential multiply/divide circuit is still used for MULH* parameter MUL_FAST = 0, +// MUL_FASTER: Retire fast multiply results to stage 2 instead of stage 3. +// Throughput is the same, but latency is reduced from 2 cycles to 1 cycle. +// Requires: MUL_FAST. +parameter MUL_FASTER = 0, + // MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove // the multiply functionality from the sequential multiply/divide circuit. -// Requires; MUL_FAST +// Requires: MUL_FAST parameter MULH_FAST = 0, // FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch diff --git a/hdl/hazard3_config_inst.vh b/hdl/hazard3_config_inst.vh index 7d63dd4..cf7ef94 100644 --- a/hdl/hazard3_config_inst.vh +++ b/hdl/hazard3_config_inst.vh @@ -43,6 +43,7 @@ .REDUCED_BYPASS (REDUCED_BYPASS), .MULDIV_UNROLL (MULDIV_UNROLL), .MUL_FAST (MUL_FAST), +.MUL_FASTER (MUL_FASTER), .MULH_FAST (MULH_FAST), .FAST_BRANCHCMP (FAST_BRANCHCMP), .BRANCH_PREDICTOR (BRANCH_PREDICTOR), diff --git a/hdl/hazard3_core.v b/hdl/hazard3_core.v index 84ef382..564233a 100644 --- a/hdl/hazard3_core.v +++ b/hdl/hazard3_core.v @@ -613,6 +613,13 @@ end wire [W_DATA-1:0] x_muldiv_result; wire [W_DATA-1:0] m_fast_mul_result; +wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && ( + MUL_FAST && d_mulop == M_OP_MUL || + MULH_FAST && d_mulop == M_OP_MULH || + MULH_FAST && d_mulop == M_OP_MULHU || + MULH_FAST && d_mulop == M_OP_MULHSU +); + generate if (EXTENSION_M) begin: has_muldiv wire x_muldiv_op_vld; @@ -630,13 +637,6 @@ if (EXTENSION_M) begin: has_muldiv wire x_muldiv_kill = m_trap_enter_soon; - wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && ( - MUL_FAST && d_mulop == M_OP_MUL || - MULH_FAST && d_mulop == M_OP_MULH || - MULH_FAST && d_mulop == M_OP_MULHU || - MULH_FAST && d_mulop == M_OP_MULHSU - ); - assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul) && !(x_muldiv_posted || x_stall_on_raw || x_muldiv_kill); @@ -668,7 +668,9 @@ if (EXTENSION_M) begin: has_muldiv if (MUL_FAST) begin: has_fast_mul - wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall; + // If MUL_FASTER is set, the multiplier produces its result + // combinatorially, so we never have to post a mul result to stage 3. + wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall && !MUL_FASTER; hazard3_mul_fast #( `include "hazard3_config_inst.vh" @@ -993,9 +995,10 @@ always @ (posedge clk or negedge rst_n) begin // - Steer captured read phase data in mw_result back through xm_result at end of AMO // - Make sure xm_result (store data) doesn't transition during stalled write dphase xm_result <= - d_csr_ren ? x_csr_rdata : - |EXTENSION_A && x_amo_phase == 3'h3 ? mw_result : - |EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result : + d_csr_ren ? x_csr_rdata : + |EXTENSION_A && x_amo_phase == 3'h3 ? mw_result : + |MUL_FASTER && x_use_fast_mul ? m_fast_mul_result : + |EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result : x_alu_result; xm_addr_align <= x_addr_sum[1:0]; end diff --git a/test/sim/tb_cxxrtl/Makefile b/test/sim/tb_cxxrtl/Makefile index 03d3a47..b72095c 100644 --- a/test/sim/tb_cxxrtl/Makefile +++ b/test/sim/tb_cxxrtl/Makefile @@ -16,6 +16,7 @@ PMP_REGIONS := 4 MULDIV_UNROLL := 2 MUL_FAST := 1 +MUL_FASTER := 1 MULH_FAST := 1 FAST_BRANCHCMP := 1 REDUCED_BYPASS := 0 @@ -43,6 +44,7 @@ SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP); SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP); SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP); SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP); +SYNTH_CMD += chparam -set MUL_FASTER $(MUL_FASTER) $(TOP); SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP); SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP); SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);