/**********************************************************************
 * DO WHAT THE FUCK YOU WANT TO AND DON'T BLAME US PUBLIC LICENSE     *
 *                    Version 3, April 2008                           *
 *                                                                    *
 * Copyright (C) 2018 Luke Wren                                       *
 *                                                                    *
 * Everyone is permitted to copy and distribute verbatim or modified  *
 * copies of this license document and accompanying software, and     *
 * changing either is allowed.                                        *
 *                                                                    *
 *   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION  *
 *                                                                    *
 * 0. You just DO WHAT THE FUCK YOU WANT TO.                          *
 * 1. We're NOT RESPONSIBLE WHEN IT DOESN'T FUCKING WORK.             *
 *                                                                    *
 *********************************************************************/

// Combined multiply/divide/modulo circuit.
// All operations performed at 1 bit per clock; aiming for minimal resource usage.
// There are lots of opportunities for off-by-one errors here. See muldiv_model.py
// for a simple reference model of the mul/div/mod iterations.
//
// When op_kill is high, the current calculation halts immediately. op_vld can be
// asserted on the same cycle, and the new calculation begins without delay, regardless
// of op_rdy. This may be used by the processor on e.g. mispredict or trap.
//
// The actual multiply/divide hardware is unsigned. We handle signedness at
// input/output.

`default_nettype none

// Sequential multiply/divide/remainder unit.
//
// Handshake summary (as implemented below):
// - A new operation is latched when (op_vld && op_rdy), or when op_kill is
//   high together with op_vld (regardless of op_rdy).
// - op_kill without op_vld returns the state machine to idle.
// - result_vld is identical to op_rdy: the result registers are presented
//   whenever the unit is not iterating and has no sign adjustment pending.
//
// Parameters:
// - XLEN:   operand width in bits. The accumulator is 2*XLEN wide.
// - UNROLL: number of shift/add (or shift/subtract) iterations evaluated per
//           clock. Must be a positive power of 2 (checked in simulation).
// - W_CTR:  width of the iteration counter; derived from XLEN, do not modify.
// W_MULOP and the M_OP_* encodings come from the included header files.
module hazard3_muldiv_seq #(
	parameter XLEN = 32,
	parameter UNROLL = 1,
	parameter W_CTR = $clog2(XLEN + 1), // do not modify
`include "hazard3_width_const.vh"
) (
	input  wire               clk,
	input  wire               rst_n,
	input  wire [W_MULOP-1:0] op,
	input  wire               op_vld,
	output wire               op_rdy,
	input  wire               op_kill,
	input  wire [XLEN-1:0]    op_a,
	input  wire [XLEN-1:0]    op_b,

	output wire [XLEN-1:0]    result_h, // mulh* or rem*
	output wire [XLEN-1:0]    result_l, // mul   or div*
	output wire               result_vld
);

`include "hazard3_ops.vh"

//synthesis translate_off
// Simulation-only parameter check. The first argument of $fatal is the
// finish_number (0, 1 or 2); the format string follows it.
generate if (UNROLL & (UNROLL - 1) || ~|UNROLL)
	initial $fatal(1, "%m: UNROLL must be a positive power of 2");
endgenerate
//synthesis translate_on

// ----------------------------------------------------------------------------
// Operation decode, operand sign adjustment

// On the first cycle, op_a and op_b go straight through to the accumulator
// and the divisor/multiplicand register. They are then adjusted in-place
// on the next cycle. This allows the same circuits to be reused for sign
// adjustment before output (and helps input timing).

reg [W_MULOP-1:0] op_r;       // Registered copy of op for the operation in flight
reg [2*XLEN-1:0]  accum;      // Working accumulator; {result_h, result_l} when done
reg [XLEN-1:0]    op_b_r;     // Multiplicand, or (after pre-adjustment) negated divisor magnitude
reg               op_a_neg_r; // Sign of operand a, captured at pre-adjustment
reg               op_b_neg_r; // Sign of operand b, captured at pre-adjustment

// Which operands are interpreted as signed, based on the registered opcode.
wire op_a_signed =
	op_r == M_OP_MULH ||
	op_r == M_OP_MULHSU ||
	op_r == M_OP_DIV ||
	op_r == M_OP_REM;

wire op_b_signed =
	op_r == M_OP_MULH ||
	op_r == M_OP_DIV ||
	op_r == M_OP_REM;

// These are only meaningful on the pre-adjustment cycle, when the low half of
// accum still holds the raw op_a and op_b_r still holds the raw op_b.
wire op_a_neg = op_a_signed && accum[XLEN-1];
wire op_b_neg = op_b_signed && op_b_r[XLEN-1];

// NOTE(review): relies on the M_OP_* encoding in hazard3_ops.vh having bit 2
// set exactly for the div/rem operations -- confirm against that header.
wire is_div = op_r[2];

// Controls for modifying sign of all/part of accumulator
wire accum_neg_l;
wire accum_inv_h;
wire accum_incr_h;

// ----------------------------------------------------------------------------
// Arithmetic circuit

// Combinatorials:
reg [2*XLEN-1:0] accum_next;
reg [2*XLEN-1:0] addend;
reg [2*XLEN-1:0] shift_tmp;
reg [2*XLEN-1:0] addsub_tmp;
reg              neg_l_borrow;

always @ (*) begin: alu
	integer i;
	// Multiply/divide iteration layers
	accum_next = accum;
	addend = {2*XLEN{1'b0}};
	addsub_tmp = {2*XLEN{1'b0}};
	neg_l_borrow = 1'b0;
	for (i = 0; i < UNROLL; i = i + 1) begin
		// op_b_r aligned to bit XLEN-1. For division the extra top bit
		// sign-extends the (negated, hence negative when nonzero) divisor.
		addend = {is_div && |op_b_r, op_b_r, {XLEN-1{1'b0}}};
		// Multiply: shift right first, then conditionally add.
		// Divide: trial-add the negated divisor first, shift left afterwards.
		shift_tmp = is_div ? accum_next : accum_next >> 1;
		addsub_tmp = shift_tmp + addend;
		// Multiply: add if the multiplier bit about to be shifted out is set.
		// Divide: keep the trial result if it did not go negative.
		accum_next = (is_div ? !addsub_tmp[2 * XLEN - 1] : accum_next[0]) ?
			addsub_tmp : shift_tmp;
		// Divide: shift left, inserting the new quotient bit at the bottom.
		if (is_div)
			accum_next = {accum_next[2*XLEN-2:0], !addsub_tmp[2 * XLEN - 1]};
	end
	// Alternative path for negation of all/part of accumulator
	// The concatenation around ~accum keeps the inversion XLEN bits wide, so
	// the extra result bit is the carry out of (~low + 1).
	if (accum_neg_l)
		{neg_l_borrow, accum_next[XLEN-1:0]} = {~accum[XLEN-1:0]} + 1'b1;
	// Upper half: optional invert and/or increment (invert + increment = negate).
	if (accum_incr_h || accum_inv_h)
		accum_next[XLEN +: XLEN] = (accum[XLEN +: XLEN] ^ {XLEN{accum_inv_h}})
			+ accum_incr_h;
end

// ----------------------------------------------------------------------------
// Main state machine

// Sequence for one operation:
//   load -> sign pre-adjust (or first iteration) -> iterate while ctr != 0
//        -> sign post-adjust (one cycle, plus one carry cycle for multiplies)

reg sign_preadj_done;
reg [W_CTR-1:0] ctr;
reg sign_postadj_done;
reg sign_postadj_carry;

localparam CTR_TOP = XLEN[W_CTR-1:0];

always @ (posedge clk or negedge rst_n) begin
	if (!rst_n) begin
		// Reset into the idle state: no iterations and no adjustments pending.
		ctr <= {W_CTR{1'b0}};
		sign_preadj_done <= 1'b1;
		sign_postadj_done <= 1'b1;
		sign_postadj_carry <= 1'b0;
		op_r <= {W_MULOP{1'b0}};
		op_a_neg_r <= 1'b0;
		op_b_neg_r <= 1'b0;
		op_b_r <= {XLEN{1'b0}};
		accum <= {XLEN*2{1'b0}};
	end else if (op_kill || (op_vld && op_rdy)) begin
		// Initialise circuit with operands + state
		// (a kill without op_vld loads an idle state instead)
		ctr <= op_vld ? CTR_TOP : {W_CTR{1'b0}};
		sign_preadj_done <= !op_vld;
		sign_postadj_done <= !op_vld;
		sign_postadj_carry <= 1'b0;
		op_r <= op;
		op_b_r <= op_b;
		accum <= {{XLEN{1'b0}}, op_a};
	end else if (!sign_preadj_done) begin
		// Pre-adjust sign if necessary, else perform first iteration immediately
		op_a_neg_r <= op_a_neg;
		op_b_neg_r <= op_b_neg;
		sign_preadj_done <= 1'b1;
		if (accum_neg_l || (op_b_neg ^ is_div)) begin
			if (accum_neg_l)
				accum[0 +: XLEN] <= accum_next[0 +: XLEN];
			// Multiply wants |b|; divide wants -|b|, so that the iteration's
			// adder performs the trial subtraction.
			if (op_b_neg ^ is_div)
				op_b_r <= -op_b_r;
		end else begin
			ctr <= ctr - UNROLL[W_CTR-1:0];
			accum <= accum_next;
		end
	end else if (|ctr) begin
		// UNROLL iterations per clock until the counter reaches zero
		ctr <= ctr - UNROLL[W_CTR-1:0];
		accum <= accum_next;
	end else if (!sign_postadj_done || sign_postadj_carry) begin
		sign_postadj_done <= 1'b1;
		if (accum_inv_h || accum_incr_h)
			accum[XLEN +: XLEN] <= accum_next[XLEN +: XLEN];
		if (accum_neg_l) begin
			accum[0 +: XLEN] <= accum_next[0 +: XLEN];
			// For multiplies, a carry out of the low-half negation needs one
			// more cycle to propagate into the (already inverted) upper half.
			if (!is_div) begin
				sign_postadj_carry <= neg_l_borrow;
				sign_postadj_done <= !neg_l_borrow;
			end
		end
	end
end

// ----------------------------------------------------------------------------
// Sign adjustment control

// Pre-adjustment: for any a, b we want |a|, |b|. Note that the magnitude of any
// 32-bit signed integer is representable by a 32-bit unsigned integer.

// Post-adjustment for division:
// We seek q, r to satisfy a = b * q + r, where a and b are given,
// and |r| < |b|. One way to do this is if
// sgn(r) = sgn(a)
// sgn(q) = sgn(a) ^ sgn(b)
// This has additional nice properties like
// -(a / b) = (-a) / b = a / (-b)

// Post-adjustment for multiplication:
// We have calculated the 2*XLEN result of |a| * |b|.
// Negate the entire accumulator if sgn(a) ^ sgn(b).
// This is done in two steps (to share div/mod circuit, and avoid 64-bit carry):
// - Negate lower half of accumulator, and invert upper half
// - Increment upper half if lower half carried

// Post-adjustment is active once iterations are finished but the adjustment
// has not yet been marked done.
wire do_postadj = ~|{ctr, sign_postadj_done};
wire op_signs_differ = op_a_neg_r ^ op_b_neg_r;

// Negate the low half: either to take |a| before iterating, or to fix the sign
// of the low result afterwards. The quotient is left as produced by the
// iterations when the divisor register is zero (division by zero).
assign accum_neg_l =
	!sign_preadj_done && op_a_neg ||
	do_postadj && !sign_postadj_carry && op_signs_differ && !(is_div && ~|op_b_r);

// Upper half controls:
//   divide, a negative:                negate remainder (invert + increment)
//   multiply, signs differ, 1st cycle: invert only
//   multiply, signs differ, carry:     increment only
assign {accum_incr_h, accum_inv_h} =
	do_postadj &&  is_div && op_a_neg_r                             ? 2'b11 :
	do_postadj && !is_div && op_signs_differ && !sign_postadj_carry ? 2'b01 :
	do_postadj && !is_div && op_signs_differ &&  sign_postadj_carry ? 2'b10 :
	                                                                  2'b00 ;

// ----------------------------------------------------------------------------
// Outputs

// Ready (and result valid) when no iterations remain and no sign adjustment
// of the accumulator is being requested this cycle.
assign op_rdy = ~|{ctr, accum_neg_l, accum_incr_h, accum_inv_h};
assign result_vld = op_rdy;

`ifndef RISCV_FORMAL_ALTOPS

assign {result_h, result_l} = accum;

`else

// Provide arithmetically simpler alternative operations, to speed up formal checks
always assert(XLEN == 32);

reg [XLEN-1:0] fml_a_saved;
reg [XLEN-1:0] fml_b_saved;

// Shadow copies of the operands of the operation in flight. These must be
// captured under the same conditions in which the main state machine latches
// a new operation: a normal handshake, or op_vld coinciding with op_kill
// (which starts a new operation regardless of op_rdy). Otherwise op_r would
// describe the new operation while the saved operands belong to the old one.
always @ (posedge clk or negedge rst_n) begin
	if (!rst_n) begin
		fml_a_saved <= {XLEN{1'b0}};
		fml_b_saved <= {XLEN{1'b0}};
	end else if (op_vld && (op_rdy || op_kill)) begin
		fml_a_saved <= op_a;
		fml_b_saved <= op_b;
	end
end

assign result_h =
	op_r == M_OP_MULH   ? (fml_a_saved + fml_b_saved) ^ 32'hf6583fb7 :
	op_r == M_OP_MULHSU ? (fml_a_saved - fml_b_saved) ^ 32'hecfbe137 :
	op_r == M_OP_MULHU  ? (fml_a_saved + fml_b_saved) ^ 32'h949ce5e8 :
	op_r == M_OP_REM    ? (fml_a_saved - fml_b_saved) ^ 32'h8da68fa5 :
	op_r == M_OP_REMU   ? (fml_a_saved - fml_b_saved) ^ 32'h3138d0e1 : 32'hdeadbeef;

assign result_l =
	op_r == M_OP_MUL    ? (fml_a_saved + fml_b_saved) ^ 32'h5876063e :
	op_r == M_OP_DIV    ? (fml_a_saved - fml_b_saved) ^ 32'h7f8529ec :
	op_r == M_OP_DIVU   ? (fml_a_saved - fml_b_saved) ^ 32'h10e8fd70 : 32'hdeadbeef;

`endif

// ----------------------------------------------------------------------------
// Interface properties

`ifdef FORMAL

always @ (posedge clk) if (rst_n && $past(rst_n)) begin: properties
	integer i;
	reg alive;

	// Once ready, stay ready until an operation is issued.
	if ($past(op_rdy && !op_vld))
		assert(op_rdy);

	// A valid result is held steady unless a kill intervened.
	if (result_vld && $past(result_vld) && !$past(op_kill))
		assert($stable({result_h, result_l}));

	// Kill will halt an in-progress operation, but a new operation may be
	// asserted simultaneously with kill.
	if ($past(op_kill))
		assert(op_rdy == !$past(op_vld));

	// We should be periodically ready (liveness property), unless new operations
	// are forced in immediately, simultaneous with a kill, in which case there
	// is no intermediate ready state.
	alive = op_rdy || (op_kill && op_vld);
	for (i = 1; i <= XLEN / UNROLL + 3; i = i + 1)
		alive = alive || $past(op_rdy || (op_kill && op_vld), i);
	assert(alive);
end

`endif

endmodule

`default_nettype wire