Hazard3/hdl/arith/hazard3_muldiv_seq.v

296 lines
9.8 KiB
Verilog

/**********************************************************************
* DO WHAT THE FUCK YOU WANT TO AND DON'T BLAME US PUBLIC LICENSE *
* Version 3, April 2008 *
* *
* Copyright (C) 2018 Luke Wren *
* *
* Everyone is permitted to copy and distribute verbatim or modified *
* copies of this license document and accompanying software, and *
* changing either is allowed. *
* *
* TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION *
* *
* 0. You just DO WHAT THE FUCK YOU WANT TO. *
* 1. We're NOT RESPONSIBLE WHEN IT DOESN'T FUCKING WORK. *
* *
*********************************************************************/
// Combined multiply/divide/modulo circuit.
// All operations performed at 1 bit per clock; aiming for minimal resource usage.
// There are lots of opportunities for off-by-one errors here. See muldiv_model.py
// for a simple reference model of the mul/div/mod iterations.
//
// When op_kill is high, the current calculation halts immediately. op_vld can be
// asserted on the same cycle, and the new calculation begins without delay, regardless
// of op_rdy. This may be used by the processor on e.g. mispredict or trap.
//
// The actual multiply/divide hardware is unsigned. We handle signedness at
// input/output.
// Parameters:
//   XLEN    - width in bits of op_a, op_b, result_h and result_l.
//   UNROLL  - number of 1-bit iteration layers evaluated combinatorially per
//             clock. Must be a positive power of 2 (checked at elaboration
//             by the simulation-only $fatal below).
//   W_CTR   - width of the iteration counter, derived from XLEN. Do not modify.
//   W_MULOP is not declared in this file; presumably it is supplied by
//   hazard3_width_const.vh -- confirm against that header.
//
// Ports:
//   op          - operation select; registered into op_r when the circuit is
//                 (re)initialised.
//   op_vld      - request strobe. A new operation starts when op_vld && op_rdy,
//                 or when op_vld coincides with op_kill (regardless of op_rdy).
//   op_rdy      - high when the iteration counter is zero and no sign
//                 adjustment of the accumulator is pending.
//   op_kill     - abandon the current calculation. Without op_vld the circuit
//                 goes idle; with op_vld the new operation is loaded instead.
//   op_a, op_b  - operands; sampled on the cycle the operation starts.
//   result_h    - upper XLEN bits of the accumulator (mulh* or rem*).
//   result_l    - lower XLEN bits of the accumulator (mul or div*).
//   result_vld  - identical to op_rdy.
module hazard3_muldiv_seq #(
    parameter XLEN = 32,
    parameter UNROLL = 1,
    parameter W_CTR = $clog2(XLEN + 1), // do not modify
`include "hazard3_width_const.vh"
) (
    input wire clk,
    input wire rst_n,
    input wire [W_MULOP-1:0] op,
    input wire op_vld,
    output wire op_rdy,
    input wire op_kill,
    input wire [XLEN-1:0] op_a,
    input wire [XLEN-1:0] op_b,
    output wire [XLEN-1:0] result_h, // mulh* or rem*
    output wire [XLEN-1:0] result_l, // mul or div*
    output wire result_vld
);
// M_OP_* operation encodings are used below but not declared in this file;
// presumably they come from this header -- confirm.
`include "hazard3_ops.vh"
// Elaboration-time parameter check (simulation only): UNROLL & (UNROLL - 1)
// is nonzero for any value that is not a power of 2, and ~|UNROLL catches 0.
//synthesis translate_off
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
    initial $fatal("%m: UNROLL must be a positive power of 2");
endgenerate
//synthesis translate_on
// ----------------------------------------------------------------------------
// Operation decode, operand sign adjustment
// On the first cycle, op_a and op_b go straight through to the accumulator
// and the divisor/multiplicand register. They are then adjusted in-place
// on the next cycle. This allows the same circuits to be reused for sign
// adjustment before output (and helps input timing).
reg [W_MULOP-1:0] op_r;        // operation in progress
reg [2*XLEN-1:0] accum;        // working accumulator; drives {result_h, result_l}
reg [XLEN-1:0] op_b_r;         // multiplicand, or (after pre-adjust) negated divisor
reg op_a_neg_r;                // op_a was a negative signed operand
reg op_b_neg_r;                // op_b was a negative signed operand
// Which operands are interpreted as signed for the registered operation.
wire op_a_signed =
    op_r == M_OP_MULH ||
    op_r == M_OP_MULHSU ||
    op_r == M_OP_DIV ||
    op_r == M_OP_REM;
wire op_b_signed =
    op_r == M_OP_MULH ||
    op_r == M_OP_DIV ||
    op_r == M_OP_REM;
// Sign tests read the registered operands: op_a sits in the low half of accum
// and op_b in op_b_r. They are only meaningful before pre-adjustment has
// modified those registers, which is when the state machine samples them.
wire op_a_neg = op_a_signed && accum[XLEN-1];
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];
// Bit 2 of the registered op selects the divide datapath. Presumably every
// div/rem encoding in hazard3_ops.vh has this bit set -- confirm.
wire is_div = op_r[2];
// Controls for modifying sign of all/part of accumulator
wire accum_neg_l;   // two's-complement negate the lower half
wire accum_inv_h;   // bitwise invert the upper half
wire accum_incr_h;  // add 1 to the (possibly inverted) upper half
// ----------------------------------------------------------------------------
// Arithmetic circuit
// Combinatorials:
reg [2*XLEN-1:0] accum_next;
reg [2*XLEN-1:0] addend;
reg [2*XLEN-1:0] shift_tmp;
reg [2*XLEN-1:0] addsub_tmp;
reg neg_l_borrow;
// Computes accum_next from accum. Blocking assignments are evaluated in order,
// so each unrolled layer consumes the accum_next produced by the previous one,
// and the sign-adjust paths at the end override the iteration result.
always @ (*) begin: alu
    integer i;
    // Multiply/divide iteration layers
    // Defaults assigned up front so no variable infers a latch.
    accum_next = accum;
    addend = {2*XLEN{1'b0}};
    addsub_tmp = {2*XLEN{1'b0}};
    neg_l_borrow = 1'b0;
    for (i = 0; i < UNROLL; i = i + 1) begin
        // op_b_r aligned to the upper half (shifted left by XLEN-1). For a
        // divide with nonzero op_b_r the top bit is set, extending the
        // negated divisor held in op_b_r to 2*XLEN bits.
        addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
        // Multiply shifts right before the conditional add; divide does not.
        shift_tmp = is_div ? accum_next : accum_next >> 1;
        addsub_tmp = shift_tmp + addend;
        // Divide: keep the trial result when its top bit is clear.
        // Multiply: add when the bit about to be shifted out is 1.
        accum_next = (is_div ? !addsub_tmp[2 * XLEN - 1] : accum_next[0]) ?
            addsub_tmp : shift_tmp;
        // Divide shifts left afterwards, inserting the new quotient bit at the LSB.
        if (is_div)
            accum_next = {accum_next[2*XLEN-2:0], !addsub_tmp[2 * XLEN - 1]};
    end
    // Alternative path for negation of all/part of accumulator
    // neg_l_borrow receives the carry out of (~lower + 1), which is set only
    // when the lower half is zero.
    if (accum_neg_l)
        {neg_l_borrow, accum_next[XLEN-1:0]} = {~accum[XLEN-1:0]} + 1'b1;
    if (accum_incr_h || accum_inv_h)
        accum_next[XLEN +: XLEN] = (accum[XLEN +: XLEN] ^ {XLEN{accum_inv_h}})
            + accum_incr_h;
end
// ----------------------------------------------------------------------------
// Main state machine
reg sign_preadj_done;       // operand sign pre-adjustment has been performed
reg [W_CTR-1:0] ctr;        // iterations remaining; zero when idle or finished
reg sign_postadj_done;      // result sign post-adjustment has been performed
reg sign_postadj_carry;     // lower-half negation carried; upper half still needs +1
localparam CTR_TOP = XLEN[W_CTR-1:0];
// Branch priority: reset, then kill/accept, then pre-adjust, then iterate,
// then post-adjust. If no branch is taken, all state holds.
always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        // Reset into the idle state: counter zero, both adjustments flagged done.
        ctr <= {W_CTR{1'b0}};
        sign_preadj_done <= 1'b1;
        sign_postadj_done <= 1'b1;
        sign_postadj_carry <= 1'b0;
        op_r <= {W_MULOP{1'b0}};
        op_a_neg_r <= 1'b0;
        op_b_neg_r <= 1'b0;
        op_b_r <= {XLEN{1'b0}};
        accum <= {XLEN*2{1'b0}};
    end else if (op_kill || (op_vld && op_rdy)) begin
        // Initialise circuit with operands + state
        // A kill without op_vld loads a zero count with both adjustments
        // flagged done, leaving the circuit idle.
        ctr <= op_vld ? CTR_TOP : {W_CTR{1'b0}};
        sign_preadj_done <= !op_vld;
        sign_postadj_done <= !op_vld;
        sign_postadj_carry <= 1'b0;
        op_r <= op;
        op_b_r <= op_b;
        accum <= {{XLEN{1'b0}}, op_a};
    end else if (!sign_preadj_done) begin
        // Pre-adjust sign if necessary, else perform first iteration immediately
        op_a_neg_r <= op_a_neg;
        op_b_neg_r <= op_b_neg;
        sign_preadj_done <= 1'b1;
        if (accum_neg_l || (op_b_neg ^ is_div)) begin
            if (accum_neg_l)
                accum[0 +: XLEN] <= accum_next[0 +: XLEN];
            // Multiply negates a negative op_b; divide negates a non-negative
            // op_b, so that the iteration adder effectively subtracts it.
            if (op_b_neg ^ is_div)
                op_b_r <= -op_b_r;
        end else begin
            ctr <= ctr - UNROLL[W_CTR-1:0];
            accum <= accum_next;
        end
    end else if (|ctr) begin
        // UNROLL iterations per clock until the counter reaches zero.
        ctr <= ctr - UNROLL[W_CTR-1:0];
        accum <= accum_next;
    end else if (!sign_postadj_done || sign_postadj_carry) begin
        sign_postadj_done <= 1'b1;
        if (accum_inv_h || accum_incr_h)
            accum[XLEN +: XLEN] <= accum_next[XLEN +: XLEN];
        if (accum_neg_l) begin
            accum[0 +: XLEN] <= accum_next[0 +: XLEN];
            // Multiply only: a carry out of the lower-half negation defers
            // completion by one cycle so the upper half can be incremented.
            if (!is_div) begin
                sign_postadj_carry <= neg_l_borrow;
                sign_postadj_done <= !neg_l_borrow;
            end
        end
    end
end
// ----------------------------------------------------------------------------
// Sign adjustment control
// Pre-adjustment: for any a, b we want |a|, |b|. Note that the magnitude of any
// 32-bit signed integer is representable by a 32-bit unsigned integer.
// Post-adjustment for division:
// We seek q, r to satisfy a = b * q + r, where a and b are given,
// and |r| < |b|. One way to do this is if
// sgn(r) = sgn(a)
// sgn(q) = sgn(a) ^ sgn(b)
// This has additional nice properties like
// -(a / b) = (-a) / b = a / (-b)
// Post-adjustment for multiplication:
// We have calculated the 2*XLEN result of |a| * |b|.
// Negate the entire accumulator if sgn(a) ^ sgn(b).
// This is done in two steps (to share div/mod circuit, and avoid 64-bit carry):
// - Negate lower half of accumulator, and invert upper half
// - Increment upper half if lower half carried
// Post-adjustment is due once the counter is zero and it has not yet been done.
wire do_postadj = ~|{ctr, sign_postadj_done};
wire op_signs_differ = op_a_neg_r ^ op_b_neg_r;
// Lower half is negated to take |op_a| during pre-adjust, or to fix the sign
// of the product/quotient during post-adjust. The quotient is left as
// produced by the iterations when the divisor register is zero.
assign accum_neg_l =
    !sign_preadj_done && op_a_neg ||
    do_postadj && !sign_postadj_carry && op_signs_differ && !(is_div && ~|op_b_r);
// Upper half: divide negates it (invert + increment) when op_a was negative;
// multiply inverts it first, then increments on a following cycle if the
// lower-half negation carried.
assign {accum_incr_h, accum_inv_h} =
    do_postadj && is_div && op_a_neg_r ? 2'b11 :
    do_postadj && !is_div && op_signs_differ && !sign_postadj_carry ? 2'b01 :
    do_postadj && !is_div && op_signs_differ && sign_postadj_carry ? 2'b10 :
    2'b00 ;
// ----------------------------------------------------------------------------
// Outputs
// Ready (and result valid) once no iterations remain and no accumulator sign
// adjustment is being requested.
assign op_rdy = ~|{ctr, accum_neg_l, accum_incr_h, accum_inv_h};
assign result_vld = op_rdy;
`ifndef RISCV_FORMAL_ALTOPS
assign {result_h, result_l} = accum;
`else
// Provide arithmetically simpler alternative operations, to speed up formal checks
always assert(XLEN == 32); // TODO may care about this one day
reg [XLEN-1:0] fml_a_saved;
reg [XLEN-1:0] fml_b_saved;
always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        fml_a_saved <= {XLEN{1'b0}};
        fml_b_saved <= {XLEN{1'b0}};
    end else if (op_vld && (op_rdy || op_kill)) begin
        // Capture operands whenever a new operation actually starts. The main
        // state machine also starts one on op_kill && op_vld regardless of
        // op_rdy, so that case must be captured too; otherwise op_r would
        // refer to the new operation while these registers still held the
        // previous operation's operands.
        fml_a_saved <= op_a;
        fml_b_saved <= op_b;
    end
end
// Each operation maps to an add or subtract of the saved operands, XORed with
// a per-operation constant. Operations that do not drive a given half return
// 32'hdeadbeef on that half.
assign result_h =
    op_r == M_OP_MULH ? (fml_a_saved + fml_b_saved) ^ 32'hf6583fb7 :
    op_r == M_OP_MULHSU ? (fml_a_saved - fml_b_saved) ^ 32'hecfbe137 :
    op_r == M_OP_MULHU ? (fml_a_saved + fml_b_saved) ^ 32'h949ce5e8 :
    op_r == M_OP_REM ? (fml_a_saved - fml_b_saved) ^ 32'h8da68fa5 :
    op_r == M_OP_REMU ? (fml_a_saved - fml_b_saved) ^ 32'h3138d0e1 : 32'hdeadbeef;
assign result_l =
    op_r == M_OP_MUL ? (fml_a_saved + fml_b_saved) ^ 32'h5876063e :
    op_r == M_OP_DIV ? (fml_a_saved - fml_b_saved) ^ 32'h7f8529ec :
    op_r == M_OP_DIVU ? (fml_a_saved - fml_b_saved) ^ 32'h10e8fd70 : 32'hdeadbeef;
`endif
// ----------------------------------------------------------------------------
// Interface properties
`ifdef FORMAL
// Checked only when reset has been deasserted for this cycle and the previous one.
always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
    integer i;
    reg alive;
    // Once ready, stay ready until an operation is presented.
    if ($past(op_rdy && !op_vld))
        assert(op_rdy);
    // A valid result does not change between consecutive valid cycles,
    // unless a kill intervened.
    if (result_vld && $past(result_vld) && !$past(op_kill))
        assert($stable({result_h, result_l}));
    // Kill will halt an in-progress operation, but a new operation may be
    // asserted simultaneously with kill.
    if ($past(op_kill))
        assert(op_rdy == !$past(op_vld));
    // We should be periodically ready (liveness property), unless new operations
    // are forced in immediately, simultaneous with a kill, in which case there
    // is no intermediate ready state.
    alive = op_rdy || (op_kill && op_vld);
    // Look back over XLEN / UNROLL + 3 previous cycles in addition to the
    // current one.
    for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
        alive = alive || $past(op_rdy || (op_kill && op_vld), i);
    assert(alive);
end
`endif
endmodule