From 23b4dbe7f3484f4dc711a7e6dfaa7ff268c72c0c Mon Sep 17 00:00:00 2001 From: Luke Wren Date: Sun, 12 Jun 2022 02:43:15 +0100 Subject: [PATCH] Redesign fetch queue: 2x32 + 3x16 -> 6x16. Should make it easier to support finer-grained flushing, and handle predicted branches cleanly. --- hdl/hazard3_core.v | 1 - hdl/hazard3_frontend.v | 255 +++++++----------- test/formal/frontend_fetch_match/tb.v | 8 +- test/formal/instruction_fetch_match/disasm.py | 6 +- .../hazard3_formal_regression.vh | 7 +- 5 files changed, 109 insertions(+), 168 deletions(-) diff --git a/hdl/hazard3_core.v b/hdl/hazard3_core.v index 2f3a5e2..680e344 100644 --- a/hdl/hazard3_core.v +++ b/hdl/hazard3_core.v @@ -108,7 +108,6 @@ wire f_mem_size; assign bus_hsize_i = f_mem_size ? HSIZE_WORD : HSIZE_HWORD; hazard3_frontend #( - .FIFO_DEPTH(2), `include "hazard3_config_inst.vh" ) frontend ( .clk (clk), diff --git a/hdl/hazard3_frontend.v b/hdl/hazard3_frontend.v index 860f3a7..34256f2 100644 --- a/hdl/hazard3_frontend.v +++ b/hdl/hazard3_frontend.v @@ -6,7 +6,6 @@ `default_nettype none module hazard3_frontend #( - parameter FIFO_DEPTH = 2, // power of 2, >= 1 `include "hazard3_config.vh" ) ( input wire clk, @@ -38,11 +37,7 @@ module hazard3_frontend #( output wire jump_target_rdy, // Interface to Decode - // Note reg/wire distinction - // => decode is providing live feedback on the CIR it is decoding, - // which we fetched previously - // This works OK because size is decoded from 2 LSBs of instruction, so cheap. - output reg [31:0] cir, + output wire [31:0] cir, output reg [1:0] cir_vld, // number of valid halfwords in CIR input wire [1:0] cir_use, // number of halfwords D intends to consume // *may* be a function of hready @@ -75,68 +70,115 @@ module hazard3_frontend #( `include "rv_opcodes.vh" +// This is the minimum size (in halfwords) for full fetch throughput, and +// there is little benefit to increasing it: +localparam FIFO_DEPTH = 6; + localparam W_BUNDLE = W_DATA / 2; -parameter W_FIFO_LEVEL = $clog2(FIFO_DEPTH + 1); // ---------------------------------------------------------------------------- // Fetch Queue (FIFO) -// -// This is a little different from either a normal sync fifo or sync fwft fifo -// so it's worth implementing from scratch wire jump_now = jump_target_vld && jump_target_rdy; -// mem has an extra entry which is equal to next-but-last entry, and valid has -// an extra entry which is constant-0. These are just there to handle loop -// boundary conditions. +// Note these registers have more than FIFO_DEPTH bits: these extras won't +// synthesise to registers, and are just there for loop boundary conditions. -// err has an error (HRESP) bit associated with each FIFO entry, so that we -// can correctly speculate and flush fetch errors. The error bit moves -// through the prefetch queue alongside the corresponding bus data. We sample -// bus errors like an extra data bit -- fetch continues to speculate forward -// past an error, and we eventually flush and redirect the frontend if an -// errored fetch makes it to the execute stage. +// Errors travel alongside data until the processor actually tries to decode +// an instruction whose fetch errored. Up until this point, errors can be +// flushed harmlessly. 
-reg [W_DATA-1:0] fifo_mem [0:FIFO_DEPTH]; -reg [FIFO_DEPTH:0] fifo_err; -reg [FIFO_DEPTH:0] fifo_valid; +reg [W_BUNDLE-1:0] fifo_mem [0:FIFO_DEPTH+1]; +reg [FIFO_DEPTH+1:0] fifo_err; +reg [FIFO_DEPTH+1:-1] fifo_valid; -wire [W_DATA-1:0] fifo_wdata = mem_data; -wire [W_DATA-1:0] fifo_rdata = fifo_mem[0]; -always @ (*) fifo_mem[FIFO_DEPTH] = fifo_wdata; +wire [1:0] mem_data_hwvalid; -wire fifo_full = fifo_valid[FIFO_DEPTH - 1]; wire fifo_empty = !fifo_valid[0]; -wire fifo_almost_full = FIFO_DEPTH == 1 || (!fifo_valid[FIFO_DEPTH - 1] && fifo_valid[FIFO_DEPTH - 2]); +// Full: will overflow after one 32b fetch. Almost full: after two of them. +wire fifo_full = fifo_valid[FIFO_DEPTH - 2]; +wire fifo_almost_full = fifo_valid[FIFO_DEPTH - 4]; wire fifo_push; -wire fifo_pop; wire fifo_dbg_inject = DEBUG_SUPPORT && dbg_instr_data_vld && dbg_instr_data_rdy; -always @ (posedge clk or negedge rst_n) begin - if (!rst_n) begin - fifo_valid <= {FIFO_DEPTH+1{1'b0}}; - end else if (jump_now) begin - fifo_valid <= {FIFO_DEPTH+1{1'b0}}; - end else if (fifo_push || fifo_pop || fifo_dbg_inject) begin - fifo_valid <= {1'b0, ~(~fifo_valid << (fifo_push || fifo_dbg_inject)) >> fifo_pop}; +// Boundary conditions +always @ (*) begin + fifo_mem[FIFO_DEPTH] = mem_data[31:16]; + fifo_mem[FIFO_DEPTH + 1] = mem_data[31:16]; + fifo_err[FIFO_DEPTH + 1 -: 2] = 2'b00; + fifo_valid[FIFO_DEPTH + 1 -: 2] = 2'b00; + fifo_valid[-1] = 1'b1; +end + +// Apply fetch, then shift out data consumed by decoder + +reg [W_BUNDLE-1:0] fifo_plus_fetch [0:FIFO_DEPTH+1]; +reg [W_BUNDLE-1:0] fifo_mem_next [0:FIFO_DEPTH-1]; +reg [FIFO_DEPTH+1:0] fifo_err_plus_fetch; +reg [FIFO_DEPTH-1:0] fifo_err_next; +reg [FIFO_DEPTH-1:0] fifo_valid_next; + +always @ (*) begin: fifo_shift + integer i; + for (i = 0; i < FIFO_DEPTH + 2; i = i + 1) begin + if (fifo_valid[i]) begin + fifo_plus_fetch[i] = fifo_mem[i]; + fifo_err_plus_fetch[i] = fifo_err[i]; + end else if (fifo_valid[i - 1] && mem_data_hwvalid[0]) begin + fifo_plus_fetch[i] = mem_data[0 * W_BUNDLE +: W_BUNDLE]; + fifo_err_plus_fetch[i] = mem_data_err; + end else begin + fifo_plus_fetch[i] = mem_data[1 * W_BUNDLE +: W_BUNDLE]; + fifo_err_plus_fetch[i] = mem_data_err; + end + end + for (i = 0; i < FIFO_DEPTH; i = i + 1) begin + if (cir_use[1]) begin + fifo_mem_next[i] = fifo_plus_fetch[i + 2]; + end else if (cir_use[0]) begin + fifo_mem_next[i] = fifo_plus_fetch[i + 1]; + end else begin + fifo_mem_next[i] = fifo_plus_fetch[i]; + end + end + if (jump_now) begin + fifo_err_next = {FIFO_DEPTH{1'b0}}; + if (cir_lock) begin + // Flush all but oldest instruction + fifo_valid_next = {{FIFO_DEPTH-2{1'b0}}, fifo_mem[0][1:0] == 2'b11, 1'b1}; + end else begin + fifo_valid_next = {FIFO_DEPTH{1'b0}}; + end + end else begin + fifo_valid_next = ~(~fifo_valid[FIFO_DEPTH-1:0] + << (fifo_push && mem_data_hwvalid[0]) + << (fifo_push && mem_data_hwvalid[1]) + ) >> cir_use; end end -always @ (posedge clk) begin: fifo_data_shift +// TODO: instruction injection. +// TODO: CIR locking. + +always @ (posedge clk or negedge rst_n) begin: fifo_update integer i; - for (i = 0; i < FIFO_DEPTH; i = i + 1) begin - if (fifo_pop || (fifo_push && !fifo_valid[i])) begin - fifo_mem[i] <= fifo_valid[i + 1] ? fifo_mem[i + 1] : fifo_wdata; - fifo_err[i] <= fifo_err[i + 1] ? fifo_err[i + 1] : mem_data_err; + if (!rst_n) begin + for (i = 0; i < FIFO_DEPTH; i = i + 1) begin + fifo_mem[i] <= {W_BUNDLE{1'b0}}; end - end - // Allow DM to inject instructions directly into the lowest-numbered queue - // entry. 
This mux should not extend critical path since it is balanced - // with the instruction-assembly muxes on the queue bypass path. - if (fifo_dbg_inject) begin - fifo_mem[0] <= dbg_instr_data; - fifo_err[0] <= 1'b0; + fifo_valid[FIFO_DEPTH-1:0] <= {FIFO_DEPTH{1'b0}}; + end else begin + // Don't clock registers whose contents we won't care about on the + // next cycle (note: inductively, if we don't care about it now we + // will never care about it until we write new data to it.) + for (i = 0; i < FIFO_DEPTH; i = i + 1) begin + if (fifo_valid_next[i]) begin + fifo_mem[i] <= fifo_mem_next[i]; + end + end + fifo_valid[FIFO_DEPTH-1:0] <= fifo_valid_next; + fifo_err[FIFO_DEPTH-1:0] <= fifo_err_next; end end @@ -153,9 +195,7 @@ wire [1:0] pending_fetches_next = pending_fetches + (mem_addr_vld && !mem_addr_h // Debugger only injects instructions when the frontend is at rest and empty. assign dbg_instr_data_rdy = DEBUG_SUPPORT && !fifo_valid[0] && ~|ctr_flush_pending; -wire cir_must_refill; -// If fetch data is forwarded past the FIFO, ensure it is not also written to it. -assign fifo_push = mem_data_vld && ~|ctr_flush_pending && !(cir_must_refill && fifo_empty) +assign fifo_push = mem_data_vld && ~|ctr_flush_pending && !(DEBUG_SUPPORT && debug_mode); always @ (posedge clk or negedge rst_n) begin @@ -206,15 +246,10 @@ always @ (posedge clk or negedge rst_n) begin if (!rst_n) begin unaligned_jump_dph <= 1'b0; end else if (EXTENSION_C) begin - if ((mem_data_vld && ~|ctr_flush_pending && !cir_lock) + if ((mem_data_vld && ~|ctr_flush_pending) || (jump_now && !unaligned_jump_now)) begin unaligned_jump_dph <= 1'b0; end - if (fifo_pop) begin - // Following a lock/unlock of the CIR, we may have an unaligned fetch in - // the FIFO, rather than consuming straight from the bus. - unaligned_jump_dph <= 1'b0; - end if (unaligned_jump_now) begin unaligned_jump_dph <= 1'b1; end @@ -236,6 +271,7 @@ always @ (posedge clk or negedge rst_n) begin end `endif +assign mem_data_hwvalid = {1'b1, !unaligned_jump_dph}; // Combinatorially generate the address-phase request @@ -246,7 +282,7 @@ always @ (posedge clk or negedge rst_n) begin end else begin reset_holdoff <= 1'b0; // This should be impossible, but assert to be sure, because it *will* - // change the fetch address (and we shouldn't check it in hardware if + // change the fetch address (and we can avoid checking in hardware if // we can prove it doesn't happen) `ifdef FORMAL assert(!(jump_target_vld && reset_holdoff)); @@ -292,122 +328,25 @@ assign jump_target_rdy = !mem_addr_hold; // ---------------------------------------------------------------------------- // Instruction assembly yard -// buf_level is the number of valid halfwords in {hwbuf, cir}. -reg [1:0] buf_level; -reg [W_BUNDLE-1:0] hwbuf; - -// You might wonder why we have a 48-bit instruction shifter {hwbuf, cir}. -// What if we had a 32-bit shifter, and tracked halfword-valid status of the -// FIFO entries? This would fail in the following case: -// -// - Initially CIR and FIFO are full -// - Consume a 16-bit instruction from CIR -// - CIR is refilled and last FIFO entry becomes half-valid. -// - Now consume a 32-bit instruction from CIR -// - There is not enough data in the last FIFO entry to refill it - -wire [W_DATA-1:0] fetch_data = fifo_empty ? 
mem_data : fifo_rdata; -wire fetch_data_vld = !fifo_empty || (mem_data_vld && ~|ctr_flush_pending && !debug_mode); - -// Shift any recycled instruction data down to backfill D's consumption -// We don't care about anything which is invalid or will be overlaid with fresh data, -// so choose these values in a way that minimises muxes -wire [3*W_BUNDLE-1:0] instr_data_shifted = - cir_use[1] ? {hwbuf, cir[W_BUNDLE +: W_BUNDLE], hwbuf} : - cir_use[0] && EXTENSION_C ? {hwbuf, hwbuf, cir[W_BUNDLE +: W_BUNDLE]} : - {hwbuf, cir}; - -// Saturating subtraction: on cir_lock deassertion, -// buf_level will be 0 but cir_use will be positive! -wire [1:0] cir_use_clipped = |buf_level ? cir_use : 2'h0; - -wire [1:0] level_next_no_fetch = buf_level - cir_use_clipped; - -// Overlay fresh fetch data onto the shifted/recycled instruction data -// Again, if something won't be looked at, generate cheapest possible garbage. -// Don't care if fetch data is valid or not, as will just retry next cycle (as long as flags set correctly) -wire instr_fetch_overlay_blocked = cir_lock || (level_next_no_fetch[1] && !unaligned_jump_dph); - -wire [3*W_BUNDLE-1:0] instr_data_plus_fetch = - instr_fetch_overlay_blocked ? instr_data_shifted : - unaligned_jump_dph && EXTENSION_C ? {instr_data_shifted[W_BUNDLE +: 2*W_BUNDLE], fetch_data[W_BUNDLE +: W_BUNDLE]} : - level_next_no_fetch[0] && EXTENSION_C ? {fetch_data, instr_data_shifted[0 +: W_BUNDLE]} : - {instr_data_shifted[2*W_BUNDLE +: W_BUNDLE], fetch_data}; - -assign cir_must_refill = !cir_lock && !level_next_no_fetch[1]; -assign fifo_pop = cir_must_refill && !fifo_empty; - -wire [1:0] buf_level_next = - jump_now || |ctr_flush_pending || cir_lock ? 2'h0 : - fetch_data_vld && unaligned_jump_dph ? 2'h1 : - buf_level + {cir_must_refill && fetch_data_vld, 1'b0} - cir_use_clipped; - always @ (posedge clk or negedge rst_n) begin if (!rst_n) begin - buf_level <= 2'h0; cir_vld <= 2'h0; end else begin `ifdef FORMAL assert(cir_vld <= 2); assert(cir_use <= cir_vld); `endif - // Update CIR flags - buf_level <= buf_level_next; - if (!cir_lock) - cir_vld <= buf_level_next & ~(buf_level_next >> 1'b1); - // Update CIR contents + cir_vld <= {fifo_valid_next[1], fifo_valid_next[0] && !fifo_valid_next[1]}; end end -// No need to reset these as they will be written before first use -always @ (posedge clk) - {hwbuf, cir} <= instr_data_plus_fetch; - -`ifdef FORMAL -reg [1:0] property_past_buf_level; // Workaround for weird non-constant $past reset issue -always @ (posedge clk or negedge rst_n) begin - if (!rst_n) begin - property_past_buf_level <= 2'h0; - end else begin - property_past_buf_level <= buf_level; - // We fetch 32 bits per cycle, max. If this happens it's due to negative overflow. - if (property_past_buf_level == 2'h0) - assert(buf_level != 2'h3); - end -end -`endif - -// Also keep track of bus errors associated with CIR contents, shifted in the -// same way as instruction data. Errors may come straight from the bus, or -// may be buffered in the prefetch queue. - -wire fetch_bus_err = fifo_empty ? mem_data_err : fifo_err[0]; - -reg [2:0] cir_bus_err; -wire [2:0] cir_bus_err_shifted = - cir_use[1] ? cir_bus_err >> 2 : - cir_use[0] && EXTENSION_C ? cir_bus_err >> 1 : cir_bus_err; - -wire [2:0] cir_bus_err_plus_fetch = - instr_fetch_overlay_blocked ? cir_bus_err_shifted : - unaligned_jump_dph && EXTENSION_C ? {cir_bus_err_shifted[2:1], fetch_bus_err} : - level_next_no_fetch && EXTENSION_C ? 
{{2{fetch_bus_err}}, cir_bus_err_shifted[0]} : - {cir_bus_err_shifted[2], {2{fetch_bus_err}}}; - -always @ (posedge clk or negedge rst_n) begin - if (!rst_n) begin - cir_bus_err <= 3'h0; - end else if (CSR_M_TRAP) begin - cir_bus_err <= cir_bus_err_plus_fetch; - end -end - -assign cir_err = cir_bus_err[1:0]; +assign cir = {fifo_mem[1], fifo_mem[0]}; +assign cir_err = fifo_err[1:0]; // ---------------------------------------------------------------------------- // Register number predecode -wire [31:0] next_instr = instr_data_plus_fetch[31:0]; +wire [31:0] next_instr = {fifo_mem_next[1], fifo_mem_next[0]}; wire next_instr_is_32bit = next_instr[1:0] == 2'b11 || ~|EXTENSION_C; always @ (*) begin diff --git a/test/formal/frontend_fetch_match/tb.v b/test/formal/frontend_fetch_match/tb.v index 6e4b31e..c6ee61a 100644 --- a/test/formal/frontend_fetch_match/tb.v +++ b/test/formal/frontend_fetch_match/tb.v @@ -41,7 +41,7 @@ always @ (posedge clk) (*keep*) wire [1:0] cir_vld; (*keep*) wire [1:0] cir_use; (*keep*) wire [1:0] cir_err; -(*keep*) wire cir_lock; +(*keep*) reg cir_lock; (*keep*) wire [4:0] predecode_rs1_coarse; (*keep*) wire [4:0] predecode_rs2_coarse; @@ -134,6 +134,7 @@ assign jump_target[0] = 1'b0; // assert on it inside the frontend. always @ (posedge clk) assume(!(jump_target_vld && !$past(rst_n))); + // ---------------------------------------------------------------------------- // Properties @@ -162,9 +163,4 @@ always @ (posedge clk) if (rst_n) begin end end -// FIXME remove - -always assume(jump_target < 100); -always assume(pc < 100); - endmodule \ No newline at end of file diff --git a/test/formal/instruction_fetch_match/disasm.py b/test/formal/instruction_fetch_match/disasm.py index c1938d3..9119364 100755 --- a/test/formal/instruction_fetch_match/disasm.py +++ b/test/formal/instruction_fetch_match/disasm.py @@ -10,5 +10,7 @@ with open("disasm.s", "w") as f: for instr in prog: f.write(f".word {instr}") -system("riscv32-unknown-elf-gcc -c disasm.s") -system("riscv32-unknown-elf-objdump -d -M numeric,no-aliases disasm.o") +system("riscv32-unknown-elf-gcc -march=rv32imac_zicsr_zba_zbb_zbc_zbs_zbkb_zifencei -c disasm.s") +# -d fails to disassemble compressed instructions (sometimes?) so use -D -j .text instead +system("riscv32-unknown-elf-objdump -D -j .text -M numeric,no-aliases disasm.o") + diff --git a/test/formal/instruction_fetch_match/hazard3_formal_regression.vh b/test/formal/instruction_fetch_match/hazard3_formal_regression.vh index 3f80aba..e239a81 100644 --- a/test/formal/instruction_fetch_match/hazard3_formal_regression.vh +++ b/test/formal/instruction_fetch_match/hazard3_formal_regression.vh @@ -56,10 +56,15 @@ wire [31:0] expect_cir; assign expect_cir[15:0 ] = instr_mem[ d_pc / 4] >> (d_pc[1] ? 16 : 0 ); assign expect_cir[31:16] = instr_mem[(d_pc + 2) / 4] >> (d_pc[1] ? 0 : 16); +// Note we can get a mismatch in the upper halfword during a CIR lock of a +// jump in the lower halfword -- the fetch will tumble into the top and this +// is fine as long as we correctly update the PC when the lock clears. +wire allow_upper_half_mismatch = fd_cir[15:0] == expect_cir[15:0] && fd_cir[1:0] != 2'b11; + always @ (posedge clk) if (rst_n) begin if (fd_cir_vld >= 2'd1) assert(fd_cir[15:0] == expect_cir[15:0]); - if (fd_cir_vld >= 2'd2 && d_pc <= MEM_SIZE_BYTES - 4) + if (fd_cir_vld >= 2'd2 && d_pc <= MEM_SIZE_BYTES - 4 && !allow_upper_half_mismatch) assert(fd_cir[31:16] == expect_cir[31:16]); end
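
As a quick way to follow the intended behaviour of the new queue without simulating the RTL, here is a rough cycle-level reference model in Python (Python is already used for test tooling in this repo, e.g. disasm.py above). This is illustration only and is not part of the patch: the names FetchQueueModel, push_fetch, pop, cir and full are invented for this sketch, CIR locking and debugger instruction injection are left out (they are still marked TODO in the RTL), and the Verilog above remains the authoritative implementation.

# Rough cycle-level reference model of the 6 x 16-bit fetch queue introduced
# by this patch. Illustration only: all names here are invented, and CIR
# locking / debugger instruction injection are not modelled.

class FetchQueueModel:
    DEPTH = 6  # halfword entries, matching the new FIFO_DEPTH localparam

    def __init__(self):
        # Oldest halfword at index 0. Each entry is (halfword, err): the bus
        # error flag travels through the queue alongside its data, as in the
        # RTL, and only matters if decode actually consumes that halfword.
        self.q = []

    def flush(self):
        # Taken jump (jump_now): discard everything fetched ahead.
        self.q.clear()

    def pop(self, n):
        # Decode consumes n halfwords (cir_use = 0, 1 or 2) from the front.
        # Within one modelled cycle, call pop() before push_fetch(); the RTL
        # applies both in the same cycle.
        assert 0 <= n <= min(len(self.q), 2)
        del self.q[:n]

    def push_fetch(self, data32, err=False, skip_low_halfword=False):
        # One 32-bit data-phase beat arrives. skip_low_halfword models
        # unaligned_jump_dph: after a jump to a halfword-aligned but not
        # word-aligned target, only the upper halfword of the first beat
        # is wanted.
        halfwords = [(data32 & 0xFFFF, err), ((data32 >> 16) & 0xFFFF, err)]
        if skip_low_halfword:
            halfwords = halfwords[1:]
        assert len(self.q) + len(halfwords) <= self.DEPTH, \
            "fetch flow control should have prevented this push"
        self.q.extend(halfwords)

    def cir(self):
        # Bottom two halfwords as presented to decode: returns
        # (cir, cir_vld, cir_err), with queue entry 0 in the low halfword.
        data, err = 0, [False, False]
        for i, (hw, e) in enumerate(self.q[:2]):
            data |= hw << (16 * i)
            err[i] = e
        return data, min(len(self.q), 2), err

    def full(self):
        # Mirrors fifo_full: five or more halfwords held, so one more 32-bit
        # fetch beat could overflow the queue.
        return len(self.q) > self.DEPTH - 2


if __name__ == "__main__":
    m = FetchQueueModel()
    # One beat holding two compressed instructions: low halfword 0x4501
    # (c.li a0, 0) and high halfword 0x4585 (c.li a1, 1).
    m.push_fetch(0x45854501)
    data, vld, err = m.cir()
    assert vld == 2 and (data & 0xFFFF) == 0x4501
    m.pop(1)  # decode takes the 16-bit instruction at the front
    data, vld, err = m.cir()
    assert vld == 1 and (data & 0xFFFF) == 0x4585

The formal harnesses touched by this patch remain the real checks; the model above is only meant to make the push/pop/flush behaviour of the halfword queue easy to follow at a glance.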