/*****************************************************************************\
|                        Copyright (C) 2021-2022 Luke Wren                    |
|                     SPDX-License-Identifier: Apache-2.0                     |
\*****************************************************************************/

// MUL-only (cfg: MUL_FAST) and MUL/MULH/MULHU/MULHSU (cfg: MUL_FAST &&
// MULH_FAST) are handled by different circuits. In either case it's a simple
// behavioural multiply, and we rely on inference to get good performance on
// FPGA.
//
// Ports:
//
//   clk, rst_n  Clock and active-low asynchronous reset. Only result_vld is
//               reset; the operand registers are not.
//   op          Multiply operation select, compared against the M_OP_*
//               constants from hazard3_ops.vh. Only used when MULH_FAST is
//               set.
//   op_vld      Operands (and op) are valid this cycle.
//   op_a, op_b  Multiplicand and multiplier.
//   result      Multiply result. Registered operands by default; when
//               MUL_FASTER is set the operands pass straight through, so
//               result is a combinatorial function of the inputs.
//   result_vld  op_vld delayed by one clock, irrespective of MUL_FASTER.

`default_nettype none

module hazard3_mul_fast #(
`include "hazard3_config.vh"
,
`include "hazard3_width_const.vh"
) (
	input  wire               clk,
	input  wire               rst_n,

	input  wire [W_MULOP-1:0] op,
	input  wire               op_vld,
	input  wire [W_DATA-1:0]  op_a,
	input  wire [W_DATA-1:0]  op_b,

	output wire [W_DATA-1:0]  result,
	output reg                result_vld
);

`include "hazard3_ops.vh"

localparam XLEN = W_DATA;

// Elaboration-time configuration checks (simulation only). Note the first
// argument of $fatal is the numeric finish_number (0, 1 or 2); the format
// string and its arguments follow it. Passing the string first is
// non-standard, and tools may reject the call or drop the message.

//synthesis translate_off
generate if (MULH_FAST && !MUL_FAST)
	initial $fatal(1, "%m: MULH_FAST requires that MUL_FAST is also set.");
endgenerate
generate if (MUL_FASTER && !MUL_FAST)
	initial $fatal(1, "%m: MUL_FASTER requires that MUL_FAST is also set.");
endgenerate
//synthesis translate_on

// Latency of 1:

always @ (posedge clk or negedge rst_n) begin
	if (!rst_n) begin
		result_vld <= 1'b0;
	end else begin
		result_vld <= op_vld;
	end
end

// ----------------------------------------------------------------------------
// Fast MUL only

generate
if (!MULH_FAST) begin: mul_only

	// This pipestage is folded into the front of the DSP tiles on UP5k. Note the
	// intention is to register the bypassed core regs at the end of X (since
	// bypass is quite slow), then perform multiply combinatorially in stage M,
	// and mux into MW result register.

	reg [XLEN-1:0] op_a_r;
	reg [XLEN-1:0] op_b_r;

	if (MUL_FASTER) begin: op_passthrough

		// No operand register: op_a_r/op_b_r are just aliases of the inputs.
		always @ (*) begin
			op_a_r = op_a;
			op_b_r = op_b;
		end

	end else begin: op_register

		// Capture operands only when they are valid. There is no reset, so
		// the registers hold their previous contents between operations.
		always @ (posedge clk) begin
			if (op_vld) begin
				op_a_r <= op_a;
				op_b_r <= op_b;
			end
		end

	end

	// This should be inferred as 3 DSP tiles on UP5k:
	//
	// 1. Register then multiply a[15: 0] and b[15: 0]
	// 2. Register then multiply a[31:16] and b[15: 0], then directly add output of 1
	// 3. Register then multiply a[15: 0] and b[31:16], then directly add output of 2
	//
	// So there is quite a long path (1x 16-bit multiply, then 2x 16-bit add). On
	// other platforms you may just end up with a pile of gates.

`ifndef RISCV_FORMAL_ALTOPS

	// Only the low XLEN bits of the product are kept, so operand signedness
	// does not matter here.
	assign result = op_a_r * op_b_r;

`else

	// riscv-formal can use a simpler function, since it's just confirming the
	// result is correctly hooked up.
	assign result = result_vld ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;

`endif

// ----------------------------------------------------------------------------
// Fast MUL/MULH/MULHU/MULHSU

end else begin: mul_and_mulh

	reg [XLEN-1:0]    op_a_r;
	reg [XLEN-1:0]    op_b_r;
	reg [W_MULOP-1:0] op_r;

	if (MUL_FASTER) begin: op_passthrough

		// No operand register: operands and op feed the multiplier directly.
		always @ (*) begin
			op_a_r = op_a;
			op_b_r = op_b;
			op_r = op;
		end

	end else begin: op_register

		// Capture operands and op only when they are valid. There is no
		// reset, so the registers hold their previous contents between
		// operations.
		always @ (posedge clk) begin
			if (op_vld) begin
				op_a_r <= op_a;
				op_b_r <= op_b;
				op_r <= op;
			end
		end

	end

	// op_a is treated as signed for MULH and MULHSU; op_b only for MULH.
	wire op_a_signed = op_r == M_OP_MULH || op_r == M_OP_MULHSU;
	wire op_b_signed = op_r == M_OP_MULH;

	// Extend both operands to 2*XLEN bits (sign-extend when signed, otherwise
	// zero-extend), so that a single unsigned 2*XLEN-bit multiply yields the
	// correct upper half for every signedness combination.
	wire [2*XLEN-1:0] op_a_sext = {
		{XLEN{op_a_r[XLEN - 1] && op_a_signed}},
		op_a_r
	};

	wire [2*XLEN-1:0] op_b_sext = {
		{XLEN{op_b_r[XLEN - 1] && op_b_signed}},
		op_b_r
	};

	wire [2*XLEN-1:0] result_full = op_a_sext * op_b_sext;

`ifndef RISCV_FORMAL_ALTOPS

	// MUL returns the lower half of the product; every other op value
	// returns the upper half.
	assign result = op_r == M_OP_MUL ? result_full[0 +: XLEN] : result_full[XLEN +: XLEN];

`else

	// Simplified per-operation functions used in place of the real multiply
	// when RISCV_FORMAL_ALTOPS is defined.
	assign result =
		op_r == M_OP_MULH   ? (op_a_r + op_b_r) ^ 32'hf6583fb7 :
		op_r == M_OP_MULHSU ? (op_a_r - op_b_r) ^ 32'hecfbe137 :
		op_r == M_OP_MULHU  ? (op_a_r + op_b_r) ^ 32'h949ce5e8 :
		op_r == M_OP_MUL    ? (op_a_r + op_b_r) ^ 32'h5876063e : 32'hdeadbeef;

`endif

end
endgenerate

endmodule

`ifndef YOSYS
`default_nettype wire
`endif