Rewrite the frontend again to track the halfword validity of in-flight transfers, and rework the old cir_lock mechanism into a modified flush (cir_flush_behind)

Luke Wren 2022-06-12 20:53:53 +01:00
parent e3da922f8b
commit 26d54d0023
3 changed files with 129 additions and 150 deletions

View File

@@ -100,7 +100,7 @@ wire [31:0] fd_cir;
wire [1:0] fd_cir_err;
wire [1:0] fd_cir_vld;
wire [1:0] df_cir_use;
wire df_cir_lock;
wire df_cir_flush_behind;
assign bus_aph_panic_i = 1'b0;
@@ -132,7 +132,7 @@ hazard3_frontend #(
.cir_err (fd_cir_err),
.cir_vld (fd_cir_vld),
.cir_use (df_cir_use),
.cir_lock (df_cir_lock),
.cir_flush_behind (df_cir_flush_behind),
.predecode_rs1_coarse (f_rs1_coarse),
.predecode_rs2_coarse (f_rs2_coarse),
@@ -200,7 +200,7 @@ hazard3_decode #(
.fd_cir_err (fd_cir_err),
.fd_cir_vld (fd_cir_vld),
.df_cir_use (df_cir_use),
.df_cir_lock (df_cir_lock),
.df_cir_flush_behind (df_cir_flush_behind),
.d_pc (d_pc),
.x_jump_not_except (x_jump_not_except),

View File

@@ -17,7 +17,7 @@ module hazard3_decode #(
input wire [1:0] fd_cir_err,
input wire [1:0] fd_cir_vld,
output wire [1:0] df_cir_use,
output wire df_cir_lock,
output wire df_cir_flush_behind,
output wire [W_ADDR-1:0] d_pc,
input wire debug_mode,
@@ -113,13 +113,16 @@ wire assert_cir_lock = jump_caused_by_d && d_stall;
wire deassert_cir_lock = !d_stall;
reg cir_lock_prev;
assign df_cir_lock = (cir_lock_prev && !deassert_cir_lock) || assert_cir_lock;
wire cir_lock = (cir_lock_prev && !deassert_cir_lock) || assert_cir_lock;
assign df_cir_flush_behind = assert_cir_lock && !cir_lock_prev;
always @ (posedge clk or negedge rst_n)
if (!rst_n)
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
cir_lock_prev <= 1'b0;
else
cir_lock_prev <= df_cir_lock;
end else begin
cir_lock_prev <= cir_lock;
end
end
reg [W_ADDR-1:0] pc;
wire [W_ADDR-1:0] pc_next = pc + (d_instr_is_32bit ? 32'h4 : 32'h2);
@@ -143,7 +146,7 @@ always @ (posedge clk or negedge rst_n) begin
// if (cir_lock_prev && deassert_cir_lock)
// assert(f_jump_target == d_jump_target);
`endif
end else if (!d_stall && !df_cir_lock) begin
end else if (!d_stall && !cir_lock) begin
pc <= pc_next;
end
end
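Taken together, the decode-side mechanism above reduces to: hold an internal lock while a jump-causing instruction is stalled, and tell the frontend only about the first cycle of that lock. A minimal stand-alone sketch (hypothetical module, not the real decode stage, assuming jump_caused_by_d and d_stall behave as named):

module lock_to_flush_sketch (
    input  wire clk,
    input  wire rst_n,
    input  wire jump_caused_by_d, // decode's instruction is the source of the jump request
    input  wire d_stall,          // that instruction cannot retire yet
    output wire flush_behind      // flush everything younger than the locked CIR contents
);

    reg  lock_prev;
    wire assert_lock   = jump_caused_by_d && d_stall;
    wire deassert_lock = !d_stall;
    wire lock          = (lock_prev && !deassert_lock) || assert_lock;

    // Pulse for one cycle only, on the first stalled cycle, alongside the
    // jump request itself.
    assign flush_behind = assert_lock && !lock_prev;

    always @ (posedge clk or negedge rst_n) begin
        if (!rst_n)
            lock_prev <= 1'b0;
        else
            lock_prev <= lock;
    end

endmodule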

View File

@@ -6,7 +6,6 @@
`default_nettype none
module hazard3_frontend #(
parameter FIFO_DEPTH = 2, // power of 2, >= 1
`include "hazard3_config.vh"
) (
input wire clk,
@@ -41,17 +40,17 @@ module hazard3_frontend #(
// Note reg/wire distinction
// => decode is providing live feedback on the CIR it is decoding,
// which we fetched previously
// This works OK because size is decoded from 2 LSBs of instruction, so cheap.
output reg [31:0] cir,
output reg [1:0] cir_vld, // number of valid halfwords in CIR
input wire [1:0] cir_use, // number of halfwords D intends to consume
// *may* be a function of hready
output wire [1:0] cir_err, // Bus error on upper/lower halfword of CIR.
input wire cir_lock,// Lock-in current contents and level of CIR.
// Assert simultaneously with a jump request,
// if decode is going to stall. This stops the CIR
// from being trashed by incoming fetch data;
// jump instructions have other side effects besides jumping!
// "flush_behind": do not flush the oldest instruction when accepting a
// jump request (but still flush younger instructions). Sometimes a
// stalled instruction may assert a jump request, because e.g. the stall
// is dependent on a bus stall signal so can't gate the request.
input wire cir_flush_behind,
// Provide the rs1/rs2 register numbers which will be in CIR next cycle.
// Coarse: valid if this instruction has a nonzero register operand.
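The cheap size decode mentioned above follows from the RISC-V encoding rule that 32-bit opcodes always carry 2'b11 in their two least-significant bits, while 16-bit C-extension opcodes use 00, 01 or 10 there. A minimal sketch of that feedback path (hypothetical module, not the real decode logic):

module instr_len_sketch (
    input  wire [31:0] cir,      // current contents of the instruction register
    output wire        is_32bit,
    output wire [1:0]  hw_used   // halfwords consumed if the instruction issues (cf. cir_use)
);

    // Bits [1:0] == 2'b11 mark a 32-bit opcode; anything else is RVC.
    assign is_32bit = cir[1:0] == 2'b11;
    assign hw_used  = is_32bit ? 2'd2 : 2'd1;

endmodule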
@@ -76,67 +75,74 @@ module hazard3_frontend #(
`include "rv_opcodes.vh"
localparam W_BUNDLE = W_DATA / 2;
parameter W_FIFO_LEVEL = $clog2(FIFO_DEPTH + 1);
// This is the minimum for full throughput (enough to avoid dropping data when
// decode stalls) and there is no significant advantage to going larger.
localparam FIFO_DEPTH = 2;
// ----------------------------------------------------------------------------
// Fetch Queue (FIFO)
//
// This is a little different from either a normal sync fifo or sync fwft fifo
// so it's worth implementing from scratch
wire jump_now = jump_target_vld && jump_target_rdy;
reg [1:0] mem_data_hwvld;
// mem has an extra entry which is equal to next-but-last entry, and valid has
// an extra entry which is constant-0. These are just there to handle loop
// boundary conditions.
// Bus errors travel alongside data. They cause an exception if the core tries
// to decode the instruction, but until then can be flushed harmlessly.
// err has an error (HRESP) bit associated with each FIFO entry, so that we
// can correctly speculate and flush fetch errors. The error bit moves
// through the prefetch queue alongside the corresponding bus data. We sample
// bus errors like an extra data bit -- fetch continues to speculate forward
// past an error, and we eventually flush and redirect the frontend if an
// errored fetch makes it to the execute stage.
reg [W_DATA-1:0] fifo_mem [0:FIFO_DEPTH];
reg [FIFO_DEPTH:0] fifo_err;
reg [1:0] fifo_valid_hw [0:FIFO_DEPTH];
reg [FIFO_DEPTH:-1] fifo_valid;
reg [W_DATA-1:0] fifo_mem [0:FIFO_DEPTH];
reg [FIFO_DEPTH:0] fifo_err;
reg [FIFO_DEPTH:0] fifo_valid;
wire [W_DATA-1:0] fifo_rdata = fifo_mem[0];
wire fifo_full = fifo_valid[FIFO_DEPTH - 1];
wire fifo_empty = !fifo_valid[0];
wire fifo_almost_full = fifo_valid[FIFO_DEPTH - 2];
wire [W_DATA-1:0] fifo_wdata = mem_data;
wire [W_DATA-1:0] fifo_rdata = fifo_mem[0];
always @ (*) fifo_mem[FIFO_DEPTH] = fifo_wdata;
wire fifo_push;
wire fifo_pop;
wire fifo_dbg_inject = DEBUG_SUPPORT && dbg_instr_data_vld && dbg_instr_data_rdy;
wire fifo_full = fifo_valid[FIFO_DEPTH - 1];
wire fifo_empty = !fifo_valid[0];
wire fifo_almost_full = FIFO_DEPTH == 1 || (!fifo_valid[FIFO_DEPTH - 1] && fifo_valid[FIFO_DEPTH - 2]);
wire fifo_push;
wire fifo_pop;
wire fifo_dbg_inject = DEBUG_SUPPORT && dbg_instr_data_vld && dbg_instr_data_rdy;
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
fifo_valid <= {FIFO_DEPTH+1{1'b0}};
end else if (jump_now) begin
fifo_valid <= {FIFO_DEPTH+1{1'b0}};
end else if (fifo_push || fifo_pop || fifo_dbg_inject) begin
fifo_valid <= {1'b0, ~(~fifo_valid << (fifo_push || fifo_dbg_inject)) >> fifo_pop};
always @ (*) begin: boundary_conditions
integer i;
fifo_mem[FIFO_DEPTH] = mem_data;
fifo_valid_hw[FIFO_DEPTH] = 2'b00;
fifo_valid[FIFO_DEPTH] = 1'b0;
fifo_valid[-1] = 1'b1;
for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
fifo_valid[i] = |EXTENSION_C ? |fifo_valid_hw[i] : fifo_valid_hw[i][0];
end
end
always @ (posedge clk) begin: fifo_data_shift
always @ (posedge clk or negedge rst_n) begin: fifo_update
integer i;
for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
if (fifo_pop || (fifo_push && !fifo_valid[i])) begin
fifo_mem[i] <= fifo_valid[i + 1] ? fifo_mem[i + 1] : fifo_wdata;
fifo_err[i] <= fifo_err[i + 1] ? fifo_err[i + 1] : mem_data_err;
if (!rst_n) begin
for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
fifo_valid_hw[i] <= 2'b00;
fifo_mem[i] <= 32'h0;
fifo_err[i] <= 1'b0;
end
end else begin
for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
if (fifo_pop || (fifo_push && !fifo_valid[i])) begin
fifo_mem[i] <= fifo_valid[i + 1] ? fifo_mem[i + 1] : mem_data;
fifo_err[i] <= fifo_valid[i + 1] ? fifo_err[i + 1] : mem_data_err;
end
fifo_valid_hw[i] <=
jump_now ? 2'h0 :
fifo_valid[i + 1] && fifo_pop ? fifo_valid_hw[i + 1] :
fifo_valid[i] && fifo_pop && !fifo_push ? 2'h0 :
fifo_valid[i] && fifo_pop && fifo_push ? mem_data_hwvld :
!fifo_valid[i] && fifo_valid[i - 1] && fifo_push && !fifo_pop ? mem_data_hwvld : fifo_valid_hw[i];
end
// Allow DM to inject instructions directly into the lowest-numbered queue
// entry. This mux should not extend critical path since it is balanced
// with the instruction-assembly muxes on the queue bypass path.
if (fifo_dbg_inject) begin
fifo_mem[0] <= dbg_instr_data;
fifo_err[0] <= 1'b0;
fifo_valid_hw[0] <= 2'b11;
end
end
// Allow DM to inject instructions directly into the lowest-numbered queue
// entry. This mux should not extend critical path since it is balanced
// with the instruction-assembly muxes on the queue bypass path.
if (fifo_dbg_inject) begin
fifo_mem[0] <= dbg_instr_data;
fifo_err[0] <= 1'b0;
end
end
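The sentinel-entry trick and the error-alongside-data scheme used above can be shown in isolation. A stand-alone sketch (hypothetical module with assumed port names; qualifying push/pop against full/empty is left to the caller):

module sentinel_fifo_sketch #(
    parameter DEPTH = 2,
    parameter W     = 32
) (
    input  wire         clk,
    input  wire         rst_n,
    input  wire         push,   // caller asserts only when not full
    input  wire         pop,    // caller asserts only when not empty
    input  wire [W-1:0] wdata,
    input  wire         werr,   // bus error flag accompanying wdata
    output wire [W-1:0] rdata,
    output wire         rerr,
    output wire         empty,
    output wire         full
);

    // Each array gets one extra entry past the end: mem[DEPTH] and err[DEPTH]
    // mirror the incoming word, and valid[DEPTH] is a constant-0 sentinel, so
    // the shift logic below can always read index i + 1 without special-casing
    // the top of the queue.
    reg [W-1:0]   mem [0:DEPTH];
    reg           err [0:DEPTH];
    reg [DEPTH:0] valid;

    always @ (*) begin
        mem[DEPTH] = wdata;
        err[DEPTH] = werr;
    end

    always @ (posedge clk) begin: data_shift
        integer i;
        for (i = 0; i < DEPTH; i = i + 1) begin
            if (pop || (push && !valid[i])) begin
                // Backfill from the entry above if that entry is valid,
                // otherwise capture the incoming word; the error flag travels
                // with its data.
                mem[i] <= valid[i + 1] ? mem[i + 1] : wdata;
                err[i] <= valid[i + 1] ? err[i + 1] : werr;
            end
        end
    end

    // Thermometer-coded valid flags: a push extends the valid region by one
    // entry, a pop shrinks it by one, and simultaneous push + pop leaves the
    // level alone (the data above still shifts, so the new word lands in the
    // right entry).
    wire [DEPTH+1:0] valid_ext = {valid, 1'b1}; // constant-1 sentinel below entry 0

    always @ (posedge clk or negedge rst_n) begin: valid_update
        integer i;
        if (!rst_n) begin
            valid <= {(DEPTH+1){1'b0}};
        end else begin
            for (i = 0; i < DEPTH; i = i + 1) begin
                if (push && !pop)
                    valid[i] <= valid_ext[i];
                else if (pop && !push)
                    valid[i] <= valid[i + 1];
            end
            valid[DEPTH] <= 1'b0;
        end
    end

    assign rdata = mem[0];
    assign rerr  = err[0];
    assign empty = !valid[0];
    assign full  = valid[DEPTH - 1];

endmodule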
@@ -199,44 +205,26 @@ always @ (posedge clk or negedge rst_n) begin
end
end
wire unaligned_jump_now = EXTENSION_C && jump_now && jump_target[1];
reg unaligned_jump_dph;
reg [1:0] mem_aph_hwvld;
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
unaligned_jump_dph <= 1'b0;
mem_data_hwvld <= 2'b11;
mem_aph_hwvld <= 2'b11;
end else if (EXTENSION_C) begin
if ((mem_data_vld && ~|ctr_flush_pending && !cir_lock)
|| (jump_now && !unaligned_jump_now)) begin
unaligned_jump_dph <= 1'b0;
end
if (fifo_pop) begin
// Following a lock/unlock of the CIR, we may have an unaligned fetch in
// the FIFO, rather than consuming straight from the bus.
unaligned_jump_dph <= 1'b0;
end
if (unaligned_jump_now) begin
unaligned_jump_dph <= 1'b1;
if (jump_now) begin
if (mem_addr_rdy) begin
mem_data_hwvld <= {1'b1, !jump_target[1]};
end else begin
mem_aph_hwvld <= {1'b1, !jump_target[1]};
end
end else if (mem_addr_vld && mem_addr_rdy) begin
mem_data_hwvld <= mem_aph_hwvld;
mem_aph_hwvld <= 2'b11;
end
end
end
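The halfword-validity tracking above boils down to attaching a two-bit mask to whichever bus phase the jump's fetch is in, then letting the mask ride along into the data phase. A minimal sketch (hypothetical module; aph_accepted stands in for the address-phase handshake, and flushing of stale in-flight data is ignored):

module hwvld_track_sketch (
    input  wire       clk,
    input  wire       rst_n,
    input  wire       jump_now,       // a jump request is accepted this cycle
    input  wire       jump_unaligned, // jump_target[1]: target is halfword- but not word-aligned
    input  wire       aph_accepted,   // the address currently driven is accepted by the bus this cycle
    output reg  [1:0] dph_hwvld       // halfword-valid mask for the word returning in data phase
);

    // Mask for the access still waiting in address phase.
    reg [1:0] aph_hwvld;

    always @ (posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            aph_hwvld <= 2'b11;
            dph_hwvld <= 2'b11;
        end else if (jump_now) begin
            // The first word fetched from the target has an invalid lower
            // halfword iff the target is only halfword-aligned. Attach the
            // mask to whichever phase the jump's fetch lands in.
            if (aph_accepted)
                dph_hwvld <= {1'b1, !jump_unaligned};
            else
                aph_hwvld <= {1'b1, !jump_unaligned};
        end else if (aph_accepted) begin
            // The transfer advances from address phase to data phase and takes
            // its mask with it; sequential fetches are always whole words.
            dph_hwvld <= aph_hwvld;
            aph_hwvld <= 2'b11;
        end
    end

endmodule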
`ifdef FORMAL
reg property_after_aligned_jump;
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
property_after_aligned_jump <= 1'b0;
end else begin
property_after_aligned_jump <= jump_now && !jump_target[1];
if (property_after_aligned_jump) begin
// Make sure this clears properly (have been subtle historic bugs here)
assert(!unaligned_jump_dph);
end
end
end
`endif
// Combinatorially generate the address-phase request
reg reset_holdoff;
@@ -296,19 +284,16 @@ assign jump_target_rdy = !mem_addr_hold;
reg [1:0] buf_level;
reg [W_BUNDLE-1:0] hwbuf;
// You might wonder why we have a 48-bit instruction shifter {hwbuf, cir}.
// What if we had a 32-bit shifter, and tracked halfword-valid status of the
// FIFO entries? This would fail in the following case:
//
// - Initially CIR and FIFO are full
// - Consume a 16-bit instruction from CIR
// - CIR is refilled and last FIFO entry becomes half-valid.
// - Now consume a 32-bit instruction from CIR
// - There is not enough data in the last FIFO entry to refill it
wire [W_DATA-1:0] fetch_data = fifo_empty ? mem_data : fifo_rdata;
wire [1:0] fetch_data_hwvld = fifo_empty ? mem_data_hwvld : fifo_valid_hw[0];
wire fetch_data_vld = !fifo_empty || (mem_data_vld && ~|ctr_flush_pending && !debug_mode);
wire [2*W_BUNDLE-1:0] fetch_data_aligned = {
fetch_data[W_BUNDLE +: W_BUNDLE],
fetch_data_hwvld[0] || ~|EXTENSION_C ?
fetch_data[0 +: W_BUNDLE] : fetch_data[W_BUNDLE +: W_BUNDLE]
};
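Put differently, the refill path only ever sees valid halfwords packed from bit 0 upward. A stand-alone sketch of that normalisation (hypothetical module, EXTENSION_C assumed enabled):

module fetch_align_sketch #(
    parameter W_BUNDLE = 16
) (
    input  wire [2*W_BUNDLE-1:0] fetch_data,
    input  wire [1:0]            fetch_data_hwvld,   // validity of lower/upper halfword
    output wire [2*W_BUNDLE-1:0] fetch_data_aligned, // valid halfwords packed from bit 0
    output wire [1:0]            fill_amount         // halfwords this word can contribute
);

    // If only the upper halfword is valid (first fetch after an unaligned
    // jump), move it down into the lower lane; the upper lane then carries a
    // duplicate that the level logic will never consume.
    assign fetch_data_aligned = {
        fetch_data[W_BUNDLE +: W_BUNDLE],
        fetch_data_hwvld[0] ? fetch_data[0 +: W_BUNDLE]
                            : fetch_data[W_BUNDLE +: W_BUNDLE]
    };

    assign fill_amount = &fetch_data_hwvld ? 2'd2 : 2'd1;

endmodule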
// Shift any recycled instruction data down to backfill D's consumption
// We don't care about anything which is invalid or will be overlaid with fresh data,
// so choose these values in a way that minimises muxes
@@ -317,52 +302,68 @@ wire [3*W_BUNDLE-1:0] instr_data_shifted =
cir_use[0] && EXTENSION_C ? {hwbuf, hwbuf, cir[W_BUNDLE +: W_BUNDLE]} :
{hwbuf, cir};
// Saturating subtraction: on cir_lock deassertion,
// buf_level will be 0 but cir_use will be positive!
wire [1:0] cir_use_clipped = |buf_level ? cir_use : 2'h0;
wire [1:0] level_next_no_fetch = buf_level - cir_use_clipped;
wire [1:0] level_next_no_fetch = buf_level - cir_use;
// Overlay fresh fetch data onto the shifted/recycled instruction data
// Again, if something won't be looked at, generate cheapest possible garbage.
// Don't care if fetch data is valid or not, as will just retry next cycle (as long as flags set correctly)
wire instr_fetch_overlay_blocked = cir_lock || (level_next_no_fetch[1] && !unaligned_jump_dph);
wire instr_fetch_overlay_blocked = level_next_no_fetch[1] && (~|EXTENSION_C || &fetch_data_hwvld);
wire [3*W_BUNDLE-1:0] instr_data_plus_fetch =
instr_fetch_overlay_blocked ? instr_data_shifted :
unaligned_jump_dph && EXTENSION_C ? {instr_data_shifted[W_BUNDLE +: 2*W_BUNDLE], fetch_data[W_BUNDLE +: W_BUNDLE]} :
level_next_no_fetch[0] && EXTENSION_C ? {fetch_data, instr_data_shifted[0 +: W_BUNDLE]} :
{instr_data_shifted[2*W_BUNDLE +: W_BUNDLE], fetch_data};
instr_fetch_overlay_blocked ? instr_data_shifted :
level_next_no_fetch[1] && |EXTENSION_C ? {fetch_data_aligned[0 +: W_BUNDLE], instr_data_shifted} :
level_next_no_fetch[0] && |EXTENSION_C ? {fetch_data_aligned, instr_data_shifted[0 +: W_BUNDLE]} :
{instr_data_shifted[2 * W_BUNDLE +: W_BUNDLE], fetch_data_aligned};
assign cir_must_refill = !cir_lock && !level_next_no_fetch[1];
// Also keep track of bus errors associated with CIR contents, shifted in the
// same way as instruction data. Errors may come straight from the bus, or
// may be buffered in the prefetch queue.
wire fetch_bus_err = fifo_empty ? mem_data_err : fifo_err[0];
reg [2:0] cir_bus_err;
wire [2:0] cir_bus_err_shifted =
cir_use[1] ? cir_bus_err >> 2 :
cir_use[0] && EXTENSION_C ? cir_bus_err >> 1 : cir_bus_err;
wire [2:0] cir_bus_err_plus_fetch =
instr_fetch_overlay_blocked ? cir_bus_err_shifted :
level_next_no_fetch[1] && |EXTENSION_C ? {fetch_bus_err, cir_bus_err_shifted[1:0]} :
level_next_no_fetch[0] && |EXTENSION_C ? {{2{fetch_bus_err}}, cir_bus_err_shifted[0]} :
{cir_bus_err_shifted[2], {2{fetch_bus_err}}};
assign cir_must_refill = !level_next_no_fetch[1];
assign fifo_pop = cir_must_refill && !fifo_empty;
wire [1:0] fetch_fill_amount = cir_must_refill && fetch_data_vld ? (
&fetch_data_hwvld ? 2'h2 : 2'h1
) : 2'h0;
wire [1:0] buf_level_next =
jump_now || |ctr_flush_pending || cir_lock ? 2'h0 :
fetch_data_vld && unaligned_jump_dph ? 2'h1 :
buf_level + {cir_must_refill && fetch_data_vld, 1'b0} - cir_use_clipped;
jump_now && cir_flush_behind ? (cir[1:0] == 2'b11 || ~|EXTENSION_C ? 2'h2 : 2'h1) :
jump_now ? 2'h0 : level_next_no_fetch + fetch_fill_amount;
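The level bookkeeping above, pulled out into a stand-alone form (hypothetical module; EXTENSION_C assumed enabled, and the surrounding formal assertions cover the assumption that the sum can neither overflow nor underflow): a flush-behind jump keeps exactly the stalled instruction at the bottom of the CIR, a plain jump empties the buffer, and otherwise the level moves by the difference between halfwords fetched and halfwords consumed.

module buf_level_sketch (
    input  wire [1:0] buf_level,     // valid halfwords currently in {hwbuf, cir}
    input  wire [1:0] cir_use,       // halfwords consumed by decode this cycle
    input  wire [1:0] fetch_fill,    // halfwords supplied by fetch this cycle (0, 1 or 2)
    input  wire       jump_now,
    input  wire       flush_behind,  // keep the oldest CIR instruction across this jump
    input  wire       cir_is_32bit,  // cir[1:0] == 2'b11
    output wire [1:0] buf_level_next
);

    assign buf_level_next =
        jump_now && flush_behind ? (cir_is_32bit ? 2'd2 : 2'd1) :
        jump_now                 ? 2'd0 :
                                   buf_level - cir_use + fetch_fill;

endmodule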
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
buf_level <= 2'h0;
cir_vld <= 2'h0;
hwbuf <= 16'h0;
cir <= 16'h0;
cir_bus_err <= 3'h0;
end else begin
`ifdef FORMAL
assert(cir_vld <= 2);
assert(cir_use <= cir_vld);
if (!jump_now)
assert(buf_level_next >= level_next_no_fetch);
`endif
// Update CIR flags
buf_level <= buf_level_next;
if (!cir_lock)
cir_vld <= buf_level_next & ~(buf_level_next >> 1'b1);
// Update CIR contents
cir_vld <= buf_level_next & ~(buf_level_next >> 1'b1);
cir_bus_err <= cir_bus_err_plus_fetch;
{hwbuf, cir} <= instr_data_plus_fetch;
end
end
// No need to reset these as they will be written before first use
always @ (posedge clk)
{hwbuf, cir} <= instr_data_plus_fetch;
`ifdef FORMAL
reg [1:0] property_past_buf_level; // Workaround for weird non-constant $past reset issue
always @ (posedge clk or negedge rst_n) begin
@@ -377,31 +378,6 @@ always @ (posedge clk or negedge rst_n) begin
end
`endif
// Also keep track of bus errors associated with CIR contents, shifted in the
// same way as instruction data. Errors may come straight from the bus, or
// may be buffered in the prefetch queue.
wire fetch_bus_err = fifo_empty ? mem_data_err : fifo_err[0];
reg [2:0] cir_bus_err;
wire [2:0] cir_bus_err_shifted =
cir_use[1] ? cir_bus_err >> 2 :
cir_use[0] && EXTENSION_C ? cir_bus_err >> 1 : cir_bus_err;
wire [2:0] cir_bus_err_plus_fetch =
instr_fetch_overlay_blocked ? cir_bus_err_shifted :
unaligned_jump_dph && EXTENSION_C ? {cir_bus_err_shifted[2:1], fetch_bus_err} :
level_next_no_fetch && EXTENSION_C ? {{2{fetch_bus_err}}, cir_bus_err_shifted[0]} :
{cir_bus_err_shifted[2], {2{fetch_bus_err}}};
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
cir_bus_err <= 3'h0;
end else if (CSR_M_TRAP) begin
cir_bus_err <= cir_bus_err_plus_fetch;
end
end
assign cir_err = cir_bus_err[1:0];
// ----------------------------------------------------------------------------