From 7dc50465053b97773e8a21dc2aef4dca945fc8da Mon Sep 17 00:00:00 2001
From: Luke Wren
Date: Sat, 2 Apr 2022 11:40:47 +0100
Subject: [PATCH] Perf option for dedicated branch comparator

---
Review notes (below the "---", so not part of the commit message):
- hazard3_branchcmp.v now ends with a newline and documents its ports and
  its reliance on the ALUOP bit encoding; hazard3_core.v gains a comment on
  the comparator selection. Hunk sizes and the diffstat are updated to
  match; the post-image blob ids on the "index" lines of those two files
  are not.
- Whitespace in this patch was re-created by hand, so context lines may need
  "git apply --ignore-whitespace".

 hdl/arith/hazard3_branchcmp.v | 57 +++++++++++++++++++++++++++++++++++
 hdl/hazard3.f                 |  1 +
 hdl/hazard3_config.vh         |  5 +++
 hdl/hazard3_config_inst.vh    |  1 +
 hdl/hazard3_core.v            | 28 +++++++++++++++--
 test/sim/tb_cxxrtl/Makefile   |  2 ++
 6 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100644 hdl/arith/hazard3_branchcmp.v

diff --git a/hdl/arith/hazard3_branchcmp.v b/hdl/arith/hazard3_branchcmp.v
new file mode 100644
index 0000000..dd849fd
--- /dev/null
+++ b/hdl/arith/hazard3_branchcmp.v
@@ -0,0 +1,57 @@
+/*****************************************************************************\
+|                        Copyright (C) 2022 Luke Wren                         |
+|                     SPDX-License-Identifier: Apache-2.0                     |
+\*****************************************************************************/
+
+`default_nettype none
+
+// The branch decision path through the ALU is slow because:
+//
+// - Sees immediates and PC on its inputs, as well as regs
+// - Add/sub rather than just add (with complex decode of the sub condition)
+// - 2 extra mux layers in front of adder if Zba extension is enabled
+//
+// So there is sometimes a timing benefit to a dedicated branch comparator.
+//
+// Ports:
+//   aluop  ALU operation select; only bits 0 and 2 are examined here.
+//   op_a   First comparison operand.
+//   op_b   Second comparison operand.
+//   cmp    op_a != op_b when aluop[0] is set; otherwise op_a < op_b,
+//          unsigned when aluop[2] is set and signed when it is clear.
+//
+// NOTE(review): the single-bit tests on aluop presumably rely on the
+// ALUOP_SUB/ALUOP_LT/ALUOP_LTU encodings in hazard3_ops.vh, which this patch
+// does not show. Confirm them before changing either side.
+
+module hazard3_branchcmp #(
+`include "hazard3_config.vh"
+,
+`include "hazard3_width_const.vh"
+) (
+	input  wire [W_ALUOP-1:0] aluop,
+	input  wire [W_DATA-1:0]  op_a,
+	input  wire [W_DATA-1:0]  op_b,
+	output wire               cmp
+);
+
+`include "hazard3_ops.vh"
+
+wire [W_DATA-1:0] diff = op_a - op_b;
+
+wire cmp_is_unsigned = aluop[2]; // aluop == ALUOP_LTU;
+
+// Equal MSBs: the true difference fits in the signed range, so diff's sign bit
+// is set exactly when op_a < op_b, for signed and unsigned alike. Differing
+// MSBs: the MSBs decide alone -- unsigned, the operand with its MSB set is the
+// larger one; signed, it is the negative (smaller) one.
+wire lt = op_a[W_DATA-1] == op_b[W_DATA-1] ? diff[W_DATA-1] :
+          cmp_is_unsigned                  ? op_b[W_DATA-1] :
+                                             op_a[W_DATA-1] ;
+
+// ALUOP_SUB is used for equality check by main ALU
+assign cmp = aluop[0] ? op_a != op_b : lt;
+
+endmodule
+
+`default_nettype wire
diff --git a/hdl/hazard3.f b/hdl/hazard3.f
index ca6051f..3bbc2cf 100644
--- a/hdl/hazard3.f
+++ b/hdl/hazard3.f
@@ -2,6 +2,7 @@ file hazard3_core.v
 file hazard3_cpu_1port.v
 file hazard3_cpu_2port.v
 file arith/hazard3_alu.v
+file arith/hazard3_branchcmp.v
 file arith/hazard3_muldiv_seq.v
 file arith/hazard3_mul_fast.v
 file arith/hazard3_priority_encode.v
diff --git a/hdl/hazard3_config.vh b/hdl/hazard3_config.vh
index ea8c044..f1ba939 100644
--- a/hdl/hazard3_config.vh
+++ b/hdl/hazard3_config.vh
@@ -114,6 +114,11 @@ parameter MUL_FAST = 0,
 // Requires; MUL_FAST
 parameter MULH_FAST = 0,
 
+// FAST_BRANCHCMP: Instantiate a separate comparator (eq/lt/ltu) for branch
+// resolution, rather than using the ALU. May improve fetch address delay.
+// (Especially if Zba extension is enabled)
+parameter FAST_BRANCHCMP = 0,
+
 // MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
 // making trap vector base partly fixed (legal, as it's WARL).
 //
diff --git a/hdl/hazard3_config_inst.vh b/hdl/hazard3_config_inst.vh
index d6844b3..90c602c 100644
--- a/hdl/hazard3_config_inst.vh
+++ b/hdl/hazard3_config_inst.vh
@@ -29,6 +29,7 @@
 .MULDIV_UNROLL (MULDIV_UNROLL),
 .MUL_FAST (MUL_FAST),
 .MULH_FAST (MULH_FAST),
+.FAST_BRANCHCMP (FAST_BRANCHCMP),
 .MTVEC_WMASK (MTVEC_WMASK),
 .W_ADDR (W_ADDR),
 .W_DATA (W_DATA)
diff --git a/hdl/hazard3_core.v b/hdl/hazard3_core.v
index 96a531f..6c48ca2 100644
--- a/hdl/hazard3_core.v
+++ b/hdl/hazard3_core.v
@@ -835,14 +835,36 @@ end
 
 // For JALR, the LSB of the result must be cleared by hardware
 wire [W_ADDR-1:0] x_jump_target = x_addr_sum & ~32'h1;
+wire              x_branch_cmp;
+
+// Branch comparison source: the ALU's own comparison output when
+// FAST_BRANCHCMP is 0, otherwise a dedicated hazard3_branchcmp instance fed
+// with d_aluop and the x_rs1_bypass/x_rs2_bypass operands.
+generate
+if (~|FAST_BRANCHCMP) begin: alu_branchcmp
+
+	assign x_branch_cmp = x_alu_cmp;
+
+end else begin: fast_branchcmp
+
+	hazard3_branchcmp #(
+	`include "hazard3_config_inst.vh"
+	) branchcmp_u (
+		.aluop (d_aluop),
+		.op_a  (x_rs1_bypass),
+		.op_b  (x_rs2_bypass),
+		.cmp   (x_branch_cmp)
+	);
+
+end
+endgenerate
 
 // Be careful not to take branches whose comparisons depend on a load result
 assign x_jump_req = !x_stall_on_raw && (
 	d_branchcond == BCOND_ALWAYS ||
-	d_branchcond == BCOND_ZERO && !x_alu_cmp ||
-	d_branchcond == BCOND_NZERO && x_alu_cmp
+	d_branchcond == BCOND_ZERO && !x_branch_cmp ||
+	d_branchcond == BCOND_NZERO && x_branch_cmp
 );
-
 
 // ----------------------------------------------------------------------------
 // Pipe Stage M
diff --git a/test/sim/tb_cxxrtl/Makefile b/test/sim/tb_cxxrtl/Makefile
index c5e68ca..854c2a0 100644
--- a/test/sim/tb_cxxrtl/Makefile
+++ b/test/sim/tb_cxxrtl/Makefile
@@ -14,6 +14,7 @@ DEBUG_SUPPORT := 1
 MULDIV_UNROLL := 2
 MUL_FAST := 1
 MULH_FAST := 1
+FAST_BRANCHCMP := 1
 REDUCED_BYPASS := 0
 
 MVENDORID_VAL := 32'hdeadbeef
@@ -38,6 +39,7 @@ SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
 SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
 SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
 SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
+SYNTH_CMD += chparam -set FAST_BRANCHCMP $(FAST_BRANCHCMP) $(TOP);
 SYNTH_CMD += chparam -set MVENDORID_VAL $(MVENDORID_VAL) $(TOP);
 SYNTH_CMD += chparam -set MIMPID_VAL $(MIMPID_VAL) $(TOP);
 SYNTH_CMD += chparam -set MCONFIGPTR_VAL $(MCONFIGPTR_VAL) $(TOP);