300 lines
9.8 KiB
Verilog
300 lines
9.8 KiB
Verilog
/**********************************************************************
|
|
* DO WHAT THE FUCK YOU WANT TO AND DON'T BLAME US PUBLIC LICENSE *
|
|
* Version 3, April 2008 *
|
|
* *
|
|
* Copyright (C) 2018 Luke Wren *
|
|
* *
|
|
* Everyone is permitted to copy and distribute verbatim or modified *
|
|
* copies of this license document and accompanying software, and *
|
|
* changing either is allowed. *
|
|
* *
|
|
* TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION *
|
|
* *
|
|
* 0. You just DO WHAT THE FUCK YOU WANT TO. *
|
|
* 1. We're NOT RESPONSIBLE WHEN IT DOESN'T FUCKING WORK. *
|
|
* *
|
|
*********************************************************************/

// Combined multiply/divide/modulo circuit.
// Operations are performed at UNROLL bits per clock (1 bit per clock by default);
// aiming for minimal resource usage.
// There are lots of opportunities for off-by-one errors here. See muldiv_model.py
// for a simple reference model of the mul/div/mod iterations.
//
// When op_kill is high, the current calculation halts immediately. op_vld can be
// asserted on the same cycle, and the new calculation begins without delay, regardless
// of op_rdy. This may be used by the processor on e.g. mispredict or trap.
//
// The actual multiply/divide hardware is unsigned. We handle signedness at
// input/output.

`default_nettype none

// hazard3_muldiv_seq: sequential multiply / divide / remainder unit.
//
// Parameters:
//   XLEN    Operand width in bits.
//   UNROLL  Number of mul/div iteration layers evaluated per clock. Must be a
//           positive power of 2 (checked at elaboration in simulation).
//   W_CTR   Width of the iteration counter, derived from XLEN. Do not modify.
//   Further parameters (e.g. W_MULOP) are pulled in from
//   hazard3_width_const.vh.
//
// Ports:
//   clk, rst_n   Clock and asynchronous, active-low reset.
//   op           Operation select; compared against the M_OP_* constants
//                (presumably supplied by hazard3_ops.vh).
//   op_vld       Request a new operation.
//   op_rdy       High when the iteration counter is zero and no sign
//                adjustment is pending. A request is accepted when
//                op_vld && op_rdy.
//   op_kill      Abandon the current operation. If op_vld is also high, the
//                new operation is loaded in the same cycle regardless of op_rdy.
//   op_a, op_b   Operands (multiplicand/multiplier, or dividend/divisor).
//   result_h     Upper half of the accumulator: mulh* or rem* result.
//   result_l     Lower half of the accumulator: mul or div* result.
//   result_vld   Identical to op_rdy.

module hazard3_muldiv_seq #(
    parameter XLEN   = 32,
    parameter UNROLL = 1,
    parameter W_CTR  = $clog2(XLEN + 1), // do not modify
`include "hazard3_width_const.vh"
) (
    input  wire               clk,
    input  wire               rst_n,
    input  wire [W_MULOP-1:0] op,
    input  wire               op_vld,
    output wire               op_rdy,
    input  wire               op_kill,
    input  wire [XLEN-1:0]    op_a,
    input  wire [XLEN-1:0]    op_b,

    output wire [XLEN-1:0]    result_h, // mulh* or rem*
    output wire [XLEN-1:0]    result_l, // mul or div*
    output wire               result_vld
);

`include "hazard3_ops.vh"

// Elaboration-time parameter check (simulation only). $fatal takes the
// finish_number as its first argument, followed by the format string.
//synthesis translate_off
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
    initial $fatal(1, "%m: UNROLL must be a positive power of 2");
endgenerate
//synthesis translate_on

// ----------------------------------------------------------------------------
// Operation decode, operand sign adjustment

// On the first cycle, op_a and op_b go straight through to the accumulator
// and the divisor/multiplicand register. They are then adjusted in-place
// on the next cycle. This allows the same circuits to be reused for sign
// adjustment before output (and helps input timing).

reg [W_MULOP-1:0] op_r;        // Registered copy of op for the operation in flight
reg [2*XLEN-1:0]  accum;       // Working accumulator; {result_h, result_l} when done
reg [XLEN-1:0]    op_b_r;      // Divisor/multiplicand register
reg               op_a_neg_r;  // op_a was negative (sampled during pre-adjust)
reg               op_b_neg_r;  // op_b was negative (sampled during pre-adjust)

wire op_a_signed =
    op_r == M_OP_MULH   ||
    op_r == M_OP_MULHSU ||
    op_r == M_OP_DIV    ||
    op_r == M_OP_REM;

wire op_b_signed =
    op_r == M_OP_MULH ||
    op_r == M_OP_DIV  ||
    op_r == M_OP_REM;

// Only meaningful before pre-adjustment has modified accum / op_b_r; the
// registered copies op_a_neg_r / op_b_neg_r are used afterwards.
wire op_a_neg = op_a_signed && accum[XLEN-1];
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];

// Bit 2 of the operation code selects the divide/remainder datapath
// (encoding is defined by the M_OP_* constants, not visible in this file).
wire is_div = op_r[2];

// Controls for modifying sign of all/part of accumulator
wire accum_neg_l;   // Two's-complement negate the lower half
wire accum_inv_h;   // Bitwise-invert the upper half
wire accum_incr_h;  // Add one to the upper half

// ----------------------------------------------------------------------------
// Arithmetic circuit

// Combinatorials:
reg [2*XLEN-1:0] accum_next;
reg [2*XLEN-1:0] addend;
reg [2*XLEN-1:0] shift_tmp;
reg [2*XLEN-1:0] addsub_tmp;
reg              neg_l_borrow;

always @ (*) begin: alu
    integer i;
    // Multiply/divide iteration layers
    accum_next   = accum;
    addend       = {2*XLEN{1'b0}};
    addsub_tmp   = {2*XLEN{1'b0}};
    neg_l_borrow = 1'b0;
    for (i = 0; i < UNROLL; i = i + 1) begin
        // op_b_r aligned at bit XLEN-1. For division, op_b_r has been
        // negated by pre-adjustment, so the extra top bit sign-extends it
        // (unless the divisor is zero) and the add below is a trial subtract.
        addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
        // Multiply shifts right before the add; divide shifts left after it.
        shift_tmp = is_div ? accum_next : accum_next >> 1;
        addsub_tmp = shift_tmp + addend;
        // Divide: keep the trial subtract if it did not go negative.
        // Multiply: add if the multiplier bit being shifted out is set.
        accum_next = (is_div ? !addsub_tmp[2 * XLEN - 1] : accum_next[0]) ?
            addsub_tmp : shift_tmp;
        // Divide: shift left, inserting the new quotient bit at the LSB.
        if (is_div)
            accum_next = {accum_next[2*XLEN-2:0], !addsub_tmp[2 * XLEN - 1]};
    end
    // Alternative path for negation of all/part of accumulator
    // The concatenation braces keep the inversion XLEN bits wide, so the
    // extra result bit is a true carry-out (set only when the low half is 0).
    if (accum_neg_l)
        {neg_l_borrow, accum_next[XLEN-1:0]} = {~accum[XLEN-1:0]} + 1'b1;
    if (accum_incr_h || accum_inv_h)
        accum_next[XLEN +: XLEN] = (accum[XLEN +: XLEN] ^ {XLEN{accum_inv_h}})
            + accum_incr_h;
end

// ----------------------------------------------------------------------------
// Main state machine

reg             sign_preadj_done;    // Input sign adjustment cycle has happened
reg [W_CTR-1:0] ctr;                 // Iterations remaining
reg             sign_postadj_done;   // Output sign adjustment has happened
reg             sign_postadj_carry;  // Second post-adjust cycle needed (mul only)

localparam CTR_TOP = XLEN[W_CTR-1:0];

always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        ctr                <= {W_CTR{1'b0}};
        sign_preadj_done   <= 1'b1;
        sign_postadj_done  <= 1'b1;
        sign_postadj_carry <= 1'b0;
        op_r               <= {W_MULOP{1'b0}};
        op_a_neg_r         <= 1'b0;
        op_b_neg_r         <= 1'b0;
        op_b_r             <= {XLEN{1'b0}};
        accum              <= {XLEN*2{1'b0}};
    end else if (op_kill || (op_vld && op_rdy)) begin
        // Initialise circuit with operands + state
        // On a kill without op_vld the registers are still loaded, but the
        // counter and adjustment flags are left in their idle state.
        ctr                <= op_vld ? CTR_TOP : {W_CTR{1'b0}};
        sign_preadj_done   <= !op_vld;
        sign_postadj_done  <= !op_vld;
        sign_postadj_carry <= 1'b0;
        op_r               <= op;
        op_b_r             <= op_b;
        accum              <= {{XLEN{1'b0}}, op_a};
    end else if (!sign_preadj_done) begin
        // Pre-adjust sign if necessary, else perform first iteration immediately
        op_a_neg_r       <= op_a_neg;
        op_b_neg_r       <= op_b_neg;
        sign_preadj_done <= 1'b1;
        if (accum_neg_l || (op_b_neg ^ is_div)) begin
            if (accum_neg_l)
                accum[0 +: XLEN] <= accum_next[0 +: XLEN];
            // Multiply wants |b|; divide wants -|b| so that the iteration's
            // adder performs the trial subtraction.
            if (op_b_neg ^ is_div)
                op_b_r <= -op_b_r;
        end else begin
            ctr   <= ctr - UNROLL[W_CTR-1:0];
            accum <= accum_next;
        end
    end else if (|ctr) begin
        ctr   <= ctr - UNROLL[W_CTR-1:0];
        accum <= accum_next;
    end else if (!sign_postadj_done || sign_postadj_carry) begin
        sign_postadj_done <= 1'b1;
        if (accum_inv_h || accum_incr_h)
            accum[XLEN +: XLEN] <= accum_next[XLEN +: XLEN];
        if (accum_neg_l) begin
            accum[0 +: XLEN] <= accum_next[0 +: XLEN];
            // Multiply only: if negating the low half carried out, take one
            // more cycle to propagate the carry into the upper half.
            if (!is_div) begin
                sign_postadj_carry <= neg_l_borrow;
                sign_postadj_done  <= !neg_l_borrow;
            end
        end
    end
end

// ----------------------------------------------------------------------------
// Sign adjustment control

// Pre-adjustment: for any a, b we want |a|, |b|. Note that the magnitude of any
// 32-bit signed integer is representable by a 32-bit unsigned integer.

// Post-adjustment for division:
// We seek q, r to satisfy a = b * q + r, where a and b are given,
// and |r| < |b|. One way to do this is if
//   sgn(r) = sgn(a)
//   sgn(q) = sgn(a) ^ sgn(b)
// This has additional nice properties like
//   -(a / b) = (-a) / b = a / (-b)

// Post-adjustment for multiplication:
// We have calculated the 2*XLEN result of |a| * |b|.
// Negate the entire accumulator if sgn(a) ^ sgn(b).
// This is done in two steps (to share div/mod circuit, and avoid 64-bit carry):
// - Negate lower half of accumulator, and invert upper half
// - Increment upper half if lower half carried

wire do_postadj      = ~|{ctr, sign_postadj_done};
wire op_signs_differ = op_a_neg_r ^ op_b_neg_r;

// Negate the low half: |a| on the way in; the quotient / low product on the
// way out. The quotient is left untouched when the divisor is zero.
assign accum_neg_l =
    !sign_preadj_done && op_a_neg ||
    do_postadj && !sign_postadj_carry && op_signs_differ && !(is_div && ~|op_b_r);

// Division: negate the remainder (invert + increment) if the dividend was negative.
// Multiplication: invert the upper half first, then increment it on the
// following cycle if the low-half negation carried.
assign {accum_incr_h, accum_inv_h} =
    do_postadj &&  is_div && op_a_neg_r                             ? 2'b11 :
    do_postadj && !is_div && op_signs_differ && !sign_postadj_carry ? 2'b01 :
    do_postadj && !is_div && op_signs_differ &&  sign_postadj_carry ? 2'b10 :
                                                                      2'b00 ;

// ----------------------------------------------------------------------------
// Outputs

// Ready (and result valid) once all iterations are complete and no sign
// adjustment is still pending.
assign op_rdy     = ~|{ctr, accum_neg_l, accum_incr_h, accum_inv_h};
assign result_vld = op_rdy;

`ifndef RISCV_FORMAL_ALTOPS

assign {result_h, result_l} = accum;

`else

// Provide arithmetically simpler alternative operations, to speed up formal checks
always assert(XLEN == 32);

reg [XLEN-1:0] fml_a_saved;
reg [XLEN-1:0] fml_b_saved;

// Capture the operands on exactly the condition under which the datapath
// loads op_r/op_b_r/accum, including operations forced in by op_kill while
// the unit is not ready. Otherwise op_r would be paired with stale operands.
always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        fml_a_saved <= {XLEN{1'b0}};
        fml_b_saved <= {XLEN{1'b0}};
    end else if (op_kill || (op_vld && op_rdy)) begin
        fml_a_saved <= op_a;
        fml_b_saved <= op_b;
    end
end

assign result_h =
    op_r == M_OP_MULH   ? (fml_a_saved + fml_b_saved) ^ 32'hf6583fb7 :
    op_r == M_OP_MULHSU ? (fml_a_saved - fml_b_saved) ^ 32'hecfbe137 :
    op_r == M_OP_MULHU  ? (fml_a_saved + fml_b_saved) ^ 32'h949ce5e8 :
    op_r == M_OP_REM    ? (fml_a_saved - fml_b_saved) ^ 32'h8da68fa5 :
    op_r == M_OP_REMU   ? (fml_a_saved - fml_b_saved) ^ 32'h3138d0e1 : 32'hdeadbeef;

assign result_l =
    op_r == M_OP_MUL    ? (fml_a_saved + fml_b_saved) ^ 32'h5876063e :
    op_r == M_OP_DIV    ? (fml_a_saved - fml_b_saved) ^ 32'h7f8529ec :
    op_r == M_OP_DIVU   ? (fml_a_saved - fml_b_saved) ^ 32'h10e8fd70 : 32'hdeadbeef;

`endif

// ----------------------------------------------------------------------------
// Interface properties

`ifdef FORMAL

always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
    integer i;
    reg alive;

    // Once ready, stay ready until an operation is presented.
    if ($past(op_rdy && !op_vld))
        assert(op_rdy);

    // A valid result holds steady unless a kill intervened.
    if (result_vld && $past(result_vld) && !$past(op_kill))
        assert($stable({result_h, result_l}));

    // Kill will halt an in-progress operation, but a new operation may be
    // asserted simultaneously with kill.
    if ($past(op_kill))
        assert(op_rdy == !$past(op_vld));

    // We should be periodically ready (liveness property), unless new operations
    // are forced in immediately, simultaneous with a kill, in which case there
    // is no intermediate ready state.
    alive = op_rdy || (op_kill && op_vld);
    for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
        alive = alive || $past(op_rdy || (op_kill && op_vld), i);
    assert(alive);
end

`endif

endmodule

`default_nettype wire