From c8afb4ac330407126af31b48099bcbf69cb15227 Mon Sep 17 00:00:00 2001
From: Luke Wren
Date: Mon, 29 Nov 2021 18:48:02 +0000
Subject: [PATCH] Add option for fast high-half multiplies

---
Review notes (this section is not part of the commit message):
 - hazard3_muldiv_seq.v: W_CTR is declared as a localparam rather than a
   body parameter, since it is derived from XLEN and was previously
   marked "do not modify".
 - hazard3_config.vh: fixed the "Requires;" typo, and qualified the
   MUL_FAST comment, which only holds when MULH_FAST is 0.
 - Line structure and indentation of this copy were reconstructed from a
   whitespace-collapsed source; context whitespace may not match the tree
   exactly (git apply --ignore-whitespace may be needed). The index lines
   still name the blobs of the original commit.

 hdl/arith/hazard3_mul_fast.v   | 99 +++++++++++++++++++++++++++++-----
 hdl/arith/hazard3_muldiv_seq.v | 33 +++++++-----
 hdl/hazard3_config.vh          | 10 ++--
 hdl/hazard3_config_inst.vh     |  1 +
 hdl/hazard3_core.v             | 17 +++---
 test/sim/tb_cxxrtl/Makefile    | 12 +++++
 6 files changed, 137 insertions(+), 35 deletions(-)

diff --git a/hdl/arith/hazard3_mul_fast.v b/hdl/arith/hazard3_mul_fast.v
index a8b28fb..da50f1f 100644
--- a/hdl/arith/hazard3_mul_fast.v
+++ b/hdl/arith/hazard3_mul_fast.v
@@ -15,21 +15,55 @@
  *                                                                   *
  *********************************************************************/
 
+// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
+// MULH_FAST) are handled by different circuits. In either case it's a simple
+// behavioural multiply, and we rely on inference to get good performance on
+// FPGA.
+
 `default_nettype none
 
 module hazard3_mul_fast #(
-	parameter XLEN = 32
+`include "hazard3_config.vh"
+,
+`include "hazard3_width_const.vh"
 ) (
-	input  wire            clk,
-	input  wire            rst_n,
-	input  wire [XLEN-1:0] op_a,
-	input  wire [XLEN-1:0] op_b,
-	input  wire            op_vld,
+	input  wire               clk,
+	input  wire               rst_n,
 
-	output wire [XLEN-1:0] result,
+	input  wire [W_MULOP-1:0] op,
+	input  wire               op_vld,
+	input  wire [W_DATA-1:0]  op_a,
+	input  wire [W_DATA-1:0]  op_b,
+
+	output wire [W_DATA-1:0]  result,
 	output reg             result_vld
 );
 
+`include "hazard3_ops.vh"
+
+localparam XLEN = W_DATA;
+
+//synthesis translate_off
+generate if (MULH_FAST && !MUL_FAST)
+	initial $fatal("%m: MULH_FAST requires that MUL_FAST is also set.");
+endgenerate
+//synthesis translate_on
+
+// Latency of 1:
+always @ (posedge clk or negedge rst_n) begin
+	if (!rst_n) begin
+		result_vld <= 1'b0;
+	end else begin
+		result_vld <= op_vld;
+	end
+end
+
+// ----------------------------------------------------------------------------
+// Fast MUL only
+
+generate
+if (!MULH_FAST) begin: mul_only
+
 // This pipestage is folded into the front of the DSP tiles on UP5k. Note the
 // intention is to register the bypassed core regs at the end of X (since
 // bypass is quite slow), then perform multiply combinatorially in stage M,
@@ -66,14 +100,55 @@ assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
 
 `endif
 
-always @ (posedge clk or negedge rst_n) begin
-	if (!rst_n) begin
-		result_vld <= 1'b0;
-	end else begin
-		result_vld <= op_vld;
+// ----------------------------------------------------------------------------
+// Fast MUL/MULH/MULHU/MULHSU
+
+end else begin: mul_and_mulh
+
+reg [XLEN-1:0]    op_a_r;
+reg [XLEN-1:0]    op_b_r;
+reg [W_MULOP-1:0] op_r;
+
+always @ (posedge clk) begin
+	if (op_vld) begin
+		op_a_r <= op_a;
+		op_b_r <= op_b;
+		op_r <= op;
 	end
 end
 
+wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
+wire op_b_signed = op_r == M_OP_MULH;
+
+wire [2*XLEN-1:0] op_a_sext = {
+	{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
+	op_a_r
+};
+
+wire [2*XLEN-1:0] op_b_sext = {
+	{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
+	op_b_r
+};
+
+wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;
+
+`ifndef RISCV_FORMAL_ALTOPS
+
+assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];
+
+`else
+
+assign result =
+	op_r == M_OP_MULH   ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
+	op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
+	op_r == M_OP_MULHU  ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
+	op_r == M_OP_MUL    ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;
+
+`endif
+
+end
+endgenerate
+
 endmodule
 
 `default_nettype wire
diff --git a/hdl/arith/hazard3_muldiv_seq.v b/hdl/arith/hazard3_muldiv_seq.v
index 10ec5a6..a2ba493 100644
--- a/hdl/arith/hazard3_muldiv_seq.v
+++ b/hdl/arith/hazard3_muldiv_seq.v
@@ -30,9 +30,8 @@
 `default_nettype none
 
 module hazard3_muldiv_seq #(
-	parameter XLEN = 32,
-	parameter UNROLL = 1,
-	parameter W_CTR = $clog2(XLEN + 1), // do not modify
+`include "hazard3_config.vh"
+,
 `include "hazard3_width_const.vh"
 ) (
 	input  wire               clk,
@@ -41,22 +40,25 @@ module hazard3_muldiv_seq #(
 	input  wire               op_vld,
 	output wire               op_rdy,
 	input  wire               op_kill,
-	input  wire [XLEN-1:0]    op_a,
-	input  wire [XLEN-1:0]    op_b,
+	input  wire [W_DATA-1:0]  op_a,
+	input  wire [W_DATA-1:0]  op_b,
 
-	output wire [XLEN-1:0]    result_h, // mulh* or rem*
-	output wire [XLEN-1:0]    result_l, // mul   or div*
+	output wire [W_DATA-1:0]  result_h, // mulh* or rem*
+	output wire [W_DATA-1:0]  result_l, // mul   or div*
 	output wire               result_vld
 );
 
 `include "hazard3_ops.vh"
 
 //synthesis translate_off
-generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
-	initial $fatal("%m: UNROLL must be a positive power of 2");
+generate if (MULDIV_UNROLL & (MULDIV_UNROLL - 1) || ~|MULDIV_UNROLL)
+	initial $fatal("%m: MULDIV_UNROLL must be a positive power of 2");
 endgenerate
 //synthesis translate_on
 
+localparam XLEN = W_DATA;
+localparam W_CTR = $clog2(XLEN + 1);
+
 // ----------------------------------------------------------------------------
 // Operation decode, operand sign adjustment
 
@@ -85,7 +87,10 @@ wire op_b_signed =
 wire op_a_neg = op_a_signed && accum[XLEN-1];
 wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
 
-wire is_div = op_r[2];
+// Non-divide parts of the circuit should be constant-folded if all the MUL
+// operations are handled by the fast multiplier
+
+wire is_div = op_r[2] || (MUL_FAST && MULH_FAST);
 
 // Controls for modifying sign of all/part of accumulator
 wire accum_neg_l;
@@ -109,7 +114,7 @@ always @ (*) begin: alu
 	addend = {2*XLEN{1'b0}};
 	addsub_tmp = {2*XLEN{1'b0}};
 	neg_l_borrow = 1'b0;
-	for (i = 0; i < UNROLL; i = i + 1) begin
+	for (i = 0; i < MULDIV_UNROLL; i = i + 1) begin
 		addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
 		shift_tmp = is_div ? accum_next : accum_next >> 1;
 		addsub_tmp = shift_tmp + addend;
@@ -167,11 +172,11 @@ always @ (posedge clk or negedge rst_n) begin
 				if (op_b_neg ^ is_div)
 					op_b_r <= -op_b_r;
 			end else begin
-				ctr <= ctr - UNROLL[W_CTR-1:0];
+				ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
 				accum <= accum_next;
 			end
 		end else if (|ctr) begin
-			ctr <= ctr - UNROLL[W_CTR-1:0];
+			ctr <= ctr - MULDIV_UNROLL[W_CTR-1:0];
 			accum <= accum_next;
 		end else if (!sign_postadj_done || sign_postadj_carry) begin
 			sign_postadj_done <= 1'b1;
@@ -287,7 +292,7 @@ always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
 	// are forced in immediately, simultaneous with a kill, in which case there
 	// is no intermediate ready state.
 	alive = op_rdy || (op_kill && op_vld);
-	for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
+	for (i = 1; i <= XLEN / MULDIV_UNROLL + 3; i = i + 1)
 		alive = alive || $past(op_rdy || (op_kill && op_vld), i);
 	assert(alive);
 end
diff --git a/hdl/hazard3_config.vh b/hdl/hazard3_config.vh
index a267274..3ddd470 100644
--- a/hdl/hazard3_config.vh
+++ b/hdl/hazard3_config.vh
@@ -94,10 +94,14 @@ parameter REDUCED_BYPASS = 0,
 parameter MULDIV_UNROLL = 1,
 
 // MUL_FAST: Use single-cycle multiply circuit for MUL instructions, retiring
-// to stage M. The sequential multiply/divide circuit is still used for
-// MULH/MULHU/MULHSU.
+// to stage M. MULH* still use the sequential circuit unless MULH_FAST is set.
 parameter MUL_FAST = 0,
 
+// MULH_FAST: extend the fast multiply circuit to also cover MULH*, and remove
+// the multiply functionality from the sequential multiply/divide circuit.
+// Requires: MUL_FAST
+parameter MULH_FAST = 0,
+
 // MTVEC_WMASK: Mask of which bits in MTVEC are modifiable. Save gates by
 // making trap vector base partly fixed (legal, as it's WARL).
 //
@@ -105,7 +109,7 @@ parameter MUL_FAST = 0,
 //
 // - Note the entire vector table must always be aligned to its size, rounded
 //   up to a power of two, so careful with the low-order bits.
-parameter MTVEC_WMASK = 32'hffffffff,
+parameter MTVEC_WMASK = 32'hfffffffd,
 
 // ----------------------------------------------------------------------------
 // Port size parameters (do not modify)
diff --git a/hdl/hazard3_config_inst.vh b/hdl/hazard3_config_inst.vh
index 4982592..bad558c 100644
--- a/hdl/hazard3_config_inst.vh
+++ b/hdl/hazard3_config_inst.vh
@@ -21,6 +21,7 @@
 .REDUCED_BYPASS (REDUCED_BYPASS),
 .MULDIV_UNROLL  (MULDIV_UNROLL),
 .MUL_FAST       (MUL_FAST),
+.MULH_FAST      (MULH_FAST),
 .MTVEC_WMASK    (MTVEC_WMASK),
 .W_ADDR         (W_ADDR),
 .W_DATA         (W_DATA)
diff --git a/hdl/hazard3_core.v b/hdl/hazard3_core.v
index 692e99f..d7c169d 100644
--- a/hdl/hazard3_core.v
+++ b/hdl/hazard3_core.v
@@ -406,14 +406,18 @@ if (EXTENSION_M) begin: has_muldiv
 
 	wire x_muldiv_kill = m_trap_enter_soon;
 
-	wire x_use_fast_mul = MUL_FAST && d_aluop == ALUOP_MULDIV && d_mulop == M_OP_MUL;
+	wire x_use_fast_mul = d_aluop == ALUOP_MULDIV && (
+		MUL_FAST  && d_mulop == M_OP_MUL    ||
+		MULH_FAST && d_mulop == M_OP_MULH   ||
+		MULH_FAST && d_mulop == M_OP_MULHU  ||
+		MULH_FAST && d_mulop == M_OP_MULHSU
+	);
 
 	assign x_muldiv_op_vld = (d_aluop == ALUOP_MULDIV && !x_use_fast_mul)
 		&& !(x_muldiv_posted || x_stall_raw || x_muldiv_kill);
 
 	hazard3_muldiv_seq #(
-		.XLEN   (W_DATA),
-		.UNROLL (MULDIV_UNROLL)
+	`include "hazard3_config_inst.vh"
 	) muldiv (
 		.clk        (clk),
 		.rst_n      (rst_n),
@@ -444,14 +448,15 @@ if (EXTENSION_M) begin: has_muldiv
 	wire x_issue_fast_mul = x_use_fast_mul && |d_rd && !x_stall;
 
 	hazard3_mul_fast #(
-		.XLEN(W_DATA)
-	) inst_hazard3_mul_fast (
+	`include "hazard3_config_inst.vh"
+	) mul_fast (
 		.clk        (clk),
 		.rst_n      (rst_n),
 
+		.op_vld     (x_issue_fast_mul),
+		.op         (d_mulop),
 		.op_a       (x_rs1_bypass),
 		.op_b       (x_rs2_bypass),
-		.op_vld     (x_issue_fast_mul),
 
 		.result     (m_fast_mul_result),
 		.result_vld (m_fast_mul_result_vld)
diff --git a/test/sim/tb_cxxrtl/Makefile b/test/sim/tb_cxxrtl/Makefile
index 2b84bd6..55bd6f6 100644
--- a/test/sim/tb_cxxrtl/Makefile
+++ b/test/sim/tb_cxxrtl/Makefile
@@ -2,11 +2,18 @@ TOP := hazard3_cpu_2port
 CDEFINES         := DUAL_PORT
 
 CPU_RESET_VECTOR := 32'hc0
+
 EXTENSION_C      := 1
 EXTENSION_M      := 1
+EXTENSION_ZBA    := 1
+EXTENSION_ZBB    := 1
+EXTENSION_ZBC    := 1
+EXTENSION_ZBS    := 1
 DEBUG_SUPPORT    := 0
+
 MULDIV_UNROLL    := 2
 MUL_FAST         := 1
+MULH_FAST        := 1
 REDUCED_BYPASS   := 0
 
 .PHONY: clean tb all
@@ -16,12 +23,17 @@ all: tb
 SYNTH_CMD += read_verilog -I ../../../hdl $(shell listfiles ../../../hdl/hazard3.f);
 SYNTH_CMD += chparam -set EXTENSION_C $(EXTENSION_C) $(TOP);
 SYNTH_CMD += chparam -set EXTENSION_M $(EXTENSION_M) $(TOP);
+SYNTH_CMD += chparam -set EXTENSION_ZBA $(EXTENSION_ZBA) $(TOP);
+SYNTH_CMD += chparam -set EXTENSION_ZBB $(EXTENSION_ZBB) $(TOP);
+SYNTH_CMD += chparam -set EXTENSION_ZBC $(EXTENSION_ZBC) $(TOP);
+SYNTH_CMD += chparam -set EXTENSION_ZBS $(EXTENSION_ZBS) $(TOP);
 SYNTH_CMD += chparam -set DEBUG_SUPPORT $(DEBUG_SUPPORT) $(TOP);
 SYNTH_CMD += chparam -set CSR_COUNTER 1 $(TOP);
 SYNTH_CMD += chparam -set RESET_VECTOR $(CPU_RESET_VECTOR) $(TOP);
 SYNTH_CMD += chparam -set REDUCED_BYPASS $(REDUCED_BYPASS) $(TOP);
 SYNTH_CMD += chparam -set MULDIV_UNROLL $(MULDIV_UNROLL) $(TOP);
 SYNTH_CMD += chparam -set MUL_FAST $(MUL_FAST) $(TOP);
+SYNTH_CMD += chparam -set MULH_FAST $(MULH_FAST) $(TOP);
 SYNTH_CMD += write_cxxrtl dut.cpp
 
 dut.cpp: