Perf option for dedicated branch comparator

This commit is contained in:
Luke Wren 2022-04-02 11:40:47 +01:00
parent 3c61fae9ef
commit 7dc5046505
6 changed files with 73 additions and 3 deletions

View File

@ -0,0 +1,42 @@
/*****************************************************************************\
| Copyright (C) 2022 Luke Wren |
| SPDX-License-Identifier: Apache-2.0 |
\*****************************************************************************/
`default_nettype none
// The branch decision path through the ALU is slow because:
//
// - Sees immediates and PC on its inputs, as well as regs
// - Add/sub rather than just add (with complex decode of the sub condition)
// - 2 extra mux layers in front of adder if Zba extension is enabled
//
// So there is sometimes a timing benefit to using a dedicated branch comparator.
module hazard3_branchcmp #(
`include "hazard3_config.vh"
,
`include "hazard3_width_const.vh"
) (
	input  wire [W_ALUOP-1:0] aluop,
	input  wire [W_DATA-1:0]  op_a,
	input  wire [W_DATA-1:0]  op_b,
	output wire               cmp
);

`include "hazard3_ops.vh"

// Dedicated comparator for branch resolution. Produces a single flag, cmp,
// from two W_DATA-wide operands:
//
// - aluop[0] set:   cmp = (op_a != op_b)
// - aluop[0] clear: cmp = (op_a < op_b), unsigned if aluop[2] is set,
//                   signed otherwise
//
// Only individual aluop bits are inspected, not the full opcode. Bit 2 stands
// in for "aluop == ALUOP_LTU", and bit 0 for ALUOP_SUB, which is the operation
// used for the equality check by the main ALU.
// NOTE(review): the bit positions rely on the encodings in hazard3_ops.vh,
// which are not visible here -- confirm before changing either file.

wire sel_unsigned  = aluop[2];
wire sel_not_equal = aluop[0];

wire msb_a = op_a[W_DATA-1];
wire msb_b = op_b[W_DATA-1];

wire [W_DATA-1:0] delta = op_a - op_b;

// Top bits match: the top bit of the difference gives the ordering.
wire lt_msb_match = delta[W_DATA-1];

// Top bits differ: the ordering is decided by the top bits alone. Unsigned,
// op_a is smaller when op_b has its top bit set; signed, op_a is smaller when
// its own top bit is set.
wire lt_msb_differ = sel_unsigned ? msb_b : msb_a;

wire less_than = (msb_a == msb_b) ? lt_msb_match : lt_msb_differ;
wire not_equal = op_a != op_b;

assign cmp = sel_not_equal ? not_equal : less_than;

endmodule
`default_nettype wire

View File

@ -2,6 +2,7 @@ file hazard3_core.v
file hazard3_cpu_1port.v
file hazard3_cpu_2port.v
file arith/hazard3_alu.v
file arith/hazard3_branchcmp.v
file arith/hazard3_muldiv_seq.v
file arith/hazard3_mul_fast.v
file arith/hazard3_priority_encode.v

View File

@ -114,6 +114,11 @@ parameter MUL_FAST = 0,
// Requires: MUL_FAST
parameter MULH_FAST = 0,
// FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
// resolution, rather than using the ALU. May improve fetch address delay.
// (Especially if Zba extension is enabled)
parameter FAST_BRANCHCMP = 0,
// MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
// making trap vector base partly fixed (legal, as it's WARL).
//

View File

@ -29,6 +29,7 @@
.MULDIV_UNROLL (MULDIV_UNROLL),
.MUL_FAST (MUL_FAST),
.MULH_FAST (MULH_FAST),
.FAST_BRANCHCMP (FAST_BRANCHCMP),
.MTVEC_WMASK (MTVEC_WMASK),
.W_ADDR (W_ADDR),
.W_DATA (W_DATA)

View File

@ -835,14 +835,33 @@ end
// For JALR, the LSB of the result must be cleared by hardware
wire [W_ADDR-1:0] x_jump_target = x_addr_sum & ~32'h1;
// Comparison result used for conditional branch resolution. Its driver is
// chosen at elaboration time by the FAST_BRANCHCMP parameter.
wire x_branch_cmp;
generate
// FAST_BRANCHCMP all-zero: no extra hardware, reuse the ALU's compare output.
if (~|FAST_BRANCHCMP) begin: alu_branchcmp
assign x_branch_cmp = x_alu_cmp;
// FAST_BRANCHCMP nonzero: instantiate the dedicated comparator. It receives
// d_aluop as its operation select and the x_rs1_bypass / x_rs2_bypass values
// as its operands.
end else begin: fast_branchcmp
hazard3_branchcmp #(
// Pass this module's configuration parameters through to the instance.
`include "hazard3_config_inst.vh"
) branchcmp_u (
.aluop (d_aluop),
.op_a (x_rs1_bypass),
.op_b (x_rs2_bypass),
.cmp (x_branch_cmp)
);
end
endgenerate
// Be careful not to take branches whose comparisons depend on a load result
// (every jump request is gated by !x_stall_on_raw).
//
// Inside the gate, a jump is requested when d_branchcond is BCOND_ALWAYS, or
// when it is BCOND_ZERO and the compare flag is 0, or when it is BCOND_NZERO
// and the compare flag is 1.
//
// NOTE(review): the BCOND_ZERO / BCOND_NZERO terms appear twice below, once
// with x_alu_cmp and once with x_branch_cmp, and no operator joins the two
// pairs. Presumably the x_alu_cmp pair is the superseded version shown by
// this change view and only the x_branch_cmp pair belongs in the file --
// confirm against the actual source.
assign x_jump_req = !x_stall_on_raw && (
d_branchcond == BCOND_ALWAYS ||
d_branchcond == BCOND_ZERO && !x_alu_cmp ||
d_branchcond == BCOND_NZERO && x_alu_cmp
d_branchcond == BCOND_ZERO && !x_branch_cmp ||
d_branchcond == BCOND_NZERO && x_branch_cmp
);
// ----------------------------------------------------------------------------
// Pipe Stage M

View File

@ -14,6 +14,7 @@ DEBUG_SUPPORT := 1
MULDIV_UNROLL := 2
MUL_FAST := 1
MULH_FAST := 1
FAST_BRANCHCMP := 1
REDUCED_BYPASS := 0
MVENDORID_VAL := 32'hdeadbeef
@ -38,6 +39,7 @@ SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);
SYNTH_CMD += chparam -set MIMPID_VAL $(MIMPID_VAL) $(TOP);
SYNTH_CMD += chparam -set MCONFIGPTR_VAL $(MCONFIGPTR_VAL) $(TOP);