diff --git a/hdl/hazard3_core.v b/hdl/hazard3_core.v
index 680e344..780c6cc 100644
--- a/hdl/hazard3_core.v
+++ b/hdl/hazard3_core.v
@@ -100,7 +100,7 @@ wire [31:0] fd_cir;
 wire [1:0]  fd_cir_err;
 wire [1:0]  fd_cir_vld;
 wire [1:0]  df_cir_use;
-wire        df_cir_lock;
+wire        df_cir_flush_behind;
 
 assign bus_aph_panic_i = 1'b0;
 
@@ -132,7 +132,7 @@ hazard3_frontend #(
 	.cir_err              (fd_cir_err),
 	.cir_vld              (fd_cir_vld),
 	.cir_use              (df_cir_use),
-	.cir_lock             (df_cir_lock),
+	.cir_flush_behind     (df_cir_flush_behind),
 
 	.predecode_rs1_coarse (f_rs1_coarse),
 	.predecode_rs2_coarse (f_rs2_coarse),
@@ -200,7 +200,7 @@ hazard3_decode #(
 	.fd_cir_err           (fd_cir_err),
 	.fd_cir_vld           (fd_cir_vld),
 	.df_cir_use           (df_cir_use),
-	.df_cir_lock          (df_cir_lock),
+	.df_cir_flush_behind  (df_cir_flush_behind),
 	.d_pc                 (d_pc),
 
 	.x_jump_not_except    (x_jump_not_except),
diff --git a/hdl/hazard3_decode.v b/hdl/hazard3_decode.v
index 3bab019..0c76afb 100644
--- a/hdl/hazard3_decode.v
+++ b/hdl/hazard3_decode.v
@@ -17,7 +17,7 @@ module hazard3_decode #(
 	input  wire [1:0]        fd_cir_err,
 	input  wire [1:0]        fd_cir_vld,
 	output wire [1:0]        df_cir_use,
-	output wire              df_cir_lock,
+	output wire              df_cir_flush_behind,
 	output wire [W_ADDR-1:0] d_pc,
 
 	input  wire              debug_mode,
@@ -113,13 +113,16 @@ wire assert_cir_lock = jump_caused_by_d && d_stall;
 wire deassert_cir_lock = !d_stall;
 
 reg cir_lock_prev;
-assign df_cir_lock = (cir_lock_prev && !deassert_cir_lock) || assert_cir_lock;
+wire cir_lock = (cir_lock_prev && !deassert_cir_lock) || assert_cir_lock;
+assign df_cir_flush_behind = assert_cir_lock && !cir_lock_prev;
 
-always @ (posedge clk or negedge rst_n)
-	if (!rst_n)
+always @ (posedge clk or negedge rst_n) begin
+	if (!rst_n) begin
 		cir_lock_prev <= 1'b0;
-	else
-		cir_lock_prev <= df_cir_lock;
+	end else begin
+		cir_lock_prev <= cir_lock;
+	end
+end
 
 reg  [W_ADDR-1:0] pc;
 wire [W_ADDR-1:0] pc_next = pc + (d_instr_is_32bit ? 32'h4 : 32'h2);
@@ -143,7 +146,7 @@ always @ (posedge clk or negedge rst_n) begin
 //		if (cir_lock_prev && deassert_cir_lock)
 //			assert(f_jump_target == d_jump_target);
 `endif
-	end else if (!d_stall && !df_cir_lock) begin
+	end else if (!d_stall && !cir_lock) begin
 		pc <= pc_next;
 	end
 end
diff --git a/hdl/hazard3_frontend.v b/hdl/hazard3_frontend.v
index 860f3a7..99e9cfb 100644
--- a/hdl/hazard3_frontend.v
+++ b/hdl/hazard3_frontend.v
@@ -6,7 +6,6 @@
 `default_nettype none
 
 module hazard3_frontend #(
-	parameter FIFO_DEPTH = 2, // power of 2, >= 1
`include "hazard3_config.vh"
 ) (
 	input wire clk,
@@ -41,17 +40,17 @@ module hazard3_frontend #(
 	// Note reg/wire distinction
 	// => decode is providing live feedback on the CIR it is decoding,
 	// which we fetched previously
-	// This works OK because size is decoded from 2 LSBs of instruction, so cheap.
 	output reg  [31:0]       cir,
 	output reg  [1:0]        cir_vld, // number of valid halfwords in CIR
 	input  wire [1:0]        cir_use, // number of halfwords D intends to consume
 	                                  // *may* be a function of hready
 	output wire [1:0]        cir_err, // Bus error on upper/lower halfword of CIR.
-	input  wire              cir_lock,// Lock-in current contents and level of CIR.
-	                                  // Assert simultaneously with a jump request,
-	                                  // if decode is going to stall. This stops the CIR
-	                                  // from being trashed by incoming fetch data;
-	                                  // jump instructions have other side effects besides jumping!
+
+	// "flush_behind": do not flush the oldest instruction when accepting a
+	// jump request (but still flush younger instructions). Sometimes a
+	// stalled instruction may assert a jump request, because e.g. the stall
+	// is dependent on a bus stall signal so can't gate the request.
+	input  wire              cir_flush_behind,
 
 	// Provide the rs1/rs2 register numbers which will be in CIR next cycle.
 	// Coarse: valid if this instruction has a nonzero register operand.
@@ -76,67 +75,74 @@ module hazard3_frontend #(
 `include "rv_opcodes.vh"
 
 localparam W_BUNDLE = W_DATA / 2;
-parameter W_FIFO_LEVEL = $clog2(FIFO_DEPTH + 1);
+// This is the minimum for full throughput (enough to avoid dropping data when
+// decode stalls) and there is no significant advantage to going larger.
+localparam FIFO_DEPTH = 2;
 
 // ----------------------------------------------------------------------------
 // Fetch Queue (FIFO)
-//
-// This is a little different from either a normal sync fifo or sync fwft fifo
-// so it's worth implementing from scratch
 
 wire jump_now = jump_target_vld && jump_target_rdy;
+reg [1:0] mem_data_hwvld;
 
-// mem has an extra entry which is equal to next-but-last entry, and valid has
-// an extra entry which is constant-0. These are just there to handle loop
-// boundary conditions.
+// Bus errors travel alongside data. They cause an exception if the core tries
+// to decode the instruction, but until then can be flushed harmlessly.
 
-// err has an error (HRESP) bit associated with each FIFO entry, so that we
-// can correctly speculate and flush fetch errors. The error bit moves
-// through the prefetch queue alongside the corresponding bus data. We sample
-// bus errors like an extra data bit -- fetch continues to speculate forward
-// past an error, and we eventually flush and redirect the frontend if an
-// errored fetch makes it to the execute stage.
+reg [W_DATA-1:0]    fifo_mem [0:FIFO_DEPTH];
+reg [FIFO_DEPTH:0]  fifo_err;
+reg [1:0]           fifo_valid_hw [0:FIFO_DEPTH];
+reg [FIFO_DEPTH:-1] fifo_valid;
 
-reg [W_DATA-1:0]   fifo_mem [0:FIFO_DEPTH];
-reg [FIFO_DEPTH:0] fifo_err;
-reg [FIFO_DEPTH:0] fifo_valid;
+wire [W_DATA-1:0] fifo_rdata = fifo_mem[0];
+wire fifo_full = fifo_valid[FIFO_DEPTH - 1];
+wire fifo_empty = !fifo_valid[0];
+wire fifo_almost_full = fifo_valid[FIFO_DEPTH - 2];
 
-wire [W_DATA-1:0] fifo_wdata = mem_data;
-wire [W_DATA-1:0] fifo_rdata = fifo_mem[0];
-always @ (*) fifo_mem[FIFO_DEPTH] = fifo_wdata;
+wire fifo_push;
+wire fifo_pop;
+wire fifo_dbg_inject = DEBUG_SUPPORT && dbg_instr_data_vld && dbg_instr_data_rdy;
 
-wire fifo_full = fifo_valid[FIFO_DEPTH - 1];
-wire fifo_empty = !fifo_valid[0];
-wire fifo_almost_full = FIFO_DEPTH == 1 || (!fifo_valid[FIFO_DEPTH - 1] && fifo_valid[FIFO_DEPTH - 2]);
-wire fifo_push;
-wire fifo_pop;
-wire fifo_dbg_inject = DEBUG_SUPPORT && dbg_instr_data_vld && dbg_instr_data_rdy;
-
-always @ (posedge clk or negedge rst_n) begin
-	if (!rst_n) begin
-		fifo_valid <= {FIFO_DEPTH+1{1'b0}};
-	end else if (jump_now) begin
-		fifo_valid <= {FIFO_DEPTH+1{1'b0}};
-	end else if (fifo_push || fifo_pop || fifo_dbg_inject) begin
-		fifo_valid <= {1'b0, ~(~fifo_valid << (fifo_push || fifo_dbg_inject)) >> fifo_pop};
+always @ (*) begin: boundary_conditions
+	integer i;
+	fifo_mem[FIFO_DEPTH] = mem_data;
+	fifo_valid_hw[FIFO_DEPTH] = 2'b00;
+	fifo_valid[FIFO_DEPTH] = 1'b0;
+	fifo_valid[-1] = 1'b1;
+	for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
+		fifo_valid[i] = |EXTENSION_C ? |fifo_valid_hw[i] : fifo_valid_hw[i][0];
 	end
 end
 
-always @ (posedge clk) begin: fifo_data_shift
+always @ (posedge clk or negedge rst_n) begin: fifo_update
 	integer i;
-	for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
-		if (fifo_pop || (fifo_push && !fifo_valid[i])) begin
-			fifo_mem[i] <= fifo_valid[i + 1] ? fifo_mem[i + 1] : fifo_wdata;
-			fifo_err[i] <= fifo_valid[i + 1] ? fifo_err[i + 1] : mem_data_err;
+	if (!rst_n) begin
+		for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
+			fifo_valid_hw[i] <= 2'b00;
+			fifo_mem[i] <= 32'h0;
+			fifo_err[i] <= 1'b0;
+		end
+	end else begin
+		for (i = 0; i < FIFO_DEPTH; i = i + 1) begin
+			if (fifo_pop || (fifo_push && !fifo_valid[i])) begin
+				fifo_mem[i] <= fifo_valid[i + 1] ? fifo_mem[i + 1] : mem_data;
+				fifo_err[i] <= fifo_valid[i + 1] ? fifo_err[i + 1] : mem_data_err;
+			end
+			fifo_valid_hw[i] <=
+				jump_now                                                      ? 2'h0 :
+				fifo_valid[i + 1] && fifo_pop                                 ? fifo_valid_hw[i + 1] :
+				fifo_valid[i] && fifo_pop && !fifo_push                       ? 2'h0 :
+				fifo_valid[i] && fifo_pop && fifo_push                        ? mem_data_hwvld :
+				!fifo_valid[i] && fifo_valid[i - 1] && fifo_push && !fifo_pop ? mem_data_hwvld : fifo_valid_hw[i];
+		end
+		// Allow DM to inject instructions directly into the lowest-numbered queue
+		// entry. This mux should not extend critical path since it is balanced
+		// with the instruction-assembly muxes on the queue bypass path.
+		if (fifo_dbg_inject) begin
+			fifo_mem[0] <= dbg_instr_data;
+			fifo_err[0] <= 1'b0;
+			fifo_valid_hw[0] <= 2'b11;
 		end
-	end
-	// Allow DM to inject instructions directly into the lowest-numbered queue
-	// entry. This mux should not extend critical path since it is balanced
-	// with the instruction-assembly muxes on the queue bypass path.
-	if (fifo_dbg_inject) begin
-		fifo_mem[0] <= dbg_instr_data;
-		fifo_err[0] <= 1'b0;
 	end
 end
 
@@ -199,44 +205,26 @@ always @ (posedge clk or negedge rst_n) begin
 	end
 end
 
-wire unaligned_jump_now = EXTENSION_C && jump_now && jump_target[1];
-reg unaligned_jump_dph;
+reg [1:0] mem_aph_hwvld;
 
 always @ (posedge clk or negedge rst_n) begin
 	if (!rst_n) begin
-		unaligned_jump_dph <= 1'b0;
+		mem_data_hwvld <= 2'b11;
+		mem_aph_hwvld <= 2'b11;
 	end else if (EXTENSION_C) begin
-		if ((mem_data_vld && ~|ctr_flush_pending && !cir_lock)
-			|| (jump_now && !unaligned_jump_now)) begin
-			unaligned_jump_dph <= 1'b0;
-		end
-		if (fifo_pop) begin
-			// Following a lock/unlock of the CIR, we may have an unaligned fetch in
-			// the FIFO, rather than consuming straight from the bus.
-			unaligned_jump_dph <= 1'b0;
-		end
-		if (unaligned_jump_now) begin
-			unaligned_jump_dph <= 1'b1;
+		if (jump_now) begin
+			if (mem_addr_rdy) begin
+				mem_data_hwvld <= {1'b1, !jump_target[1]};
+			end else begin
+				mem_aph_hwvld <= {1'b1, !jump_target[1]};
+			end
+		end else if (mem_addr_vld && mem_addr_rdy) begin
+			mem_data_hwvld <= mem_aph_hwvld;
+			mem_aph_hwvld <= 2'b11;
 		end
 	end
 end
 
-`ifdef FORMAL
-reg property_after_aligned_jump;
-always @ (posedge clk or negedge rst_n) begin
-	if (!rst_n) begin
-		property_after_aligned_jump <= 1'b0;
-	end else begin
-		property_after_aligned_jump <= jump_now && !jump_target[1];
-		if (property_after_aligned_jump) begin
-			// Make sure this clears properly (have been subtle historic bugs here)
-			assert(!unaligned_jump_dph);
-		end
-	end
-end
-`endif
-
-
 // Combinatorially generate the address-phase request
 
 reg reset_holdoff;
@@ -296,19 +284,16 @@ assign jump_target_rdy = !mem_addr_hold;
 reg [1:0] buf_level;
 reg [W_BUNDLE-1:0] hwbuf;
 
-// You might wonder why we have a 48-bit instruction shifter {hwbuf, cir}.
-// What if we had a 32-bit shifter, and tracked halfword-valid status of the
-// FIFO entries? This would fail in the following case:
-//
-// - Initially CIR and FIFO are full
-// - Consume a 16-bit instruction from CIR
-// - CIR is refilled and last FIFO entry becomes half-valid.
-// - Now consume a 32-bit instruction from CIR
-// - There is not enough data in the last FIFO entry to refill it
-
 wire [W_DATA-1:0] fetch_data = fifo_empty ? mem_data : fifo_rdata;
+wire [1:0] fetch_data_hwvld = fifo_empty ? mem_data_hwvld : fifo_valid_hw[0];
 wire fetch_data_vld = !fifo_empty || (mem_data_vld && ~|ctr_flush_pending && !debug_mode);
 
+wire [2*W_BUNDLE-1:0] fetch_data_aligned = {
+	fetch_data[W_BUNDLE +: W_BUNDLE],
+	fetch_data_hwvld[0] || ~|EXTENSION_C ?
+		fetch_data[0 +: W_BUNDLE] : fetch_data[W_BUNDLE +: W_BUNDLE]
+};
+
 // Shift any recycled instruction data down to backfill D's consumption
 // We don't care about anything which is invalid or will be overlaid with fresh data,
 // so choose these values in a way that minimises muxes
@@ -317,52 +302,68 @@ wire [3*W_BUNDLE-1:0] instr_data_shifted =
 	cir_use[0] && EXTENSION_C ? {hwbuf, hwbuf, cir[W_BUNDLE +: W_BUNDLE]} :
 	                            {hwbuf, cir};
 
-// Saturating subtraction: on cir_lock deassertion,
-// buf_level will be 0 but cir_use will be positive!
-wire [1:0] cir_use_clipped = |buf_level ? cir_use : 2'h0;
-
-wire [1:0] level_next_no_fetch = buf_level - cir_use_clipped;
+wire [1:0] level_next_no_fetch = buf_level - cir_use;
 
 // Overlay fresh fetch data onto the shifted/recycled instruction data
 // Again, if something won't be looked at, generate cheapest possible garbage.
-// Don't care if fetch data is valid or not, as will just retry next cycle (as long as flags set correctly)
-wire instr_fetch_overlay_blocked = cir_lock || (level_next_no_fetch[1] && !unaligned_jump_dph);
+wire instr_fetch_overlay_blocked = level_next_no_fetch[1] && (~|EXTENSION_C || &fetch_data_hwvld);
+
 wire [3*W_BUNDLE-1:0] instr_data_plus_fetch =
-	instr_fetch_overlay_blocked ? instr_data_shifted :
-	unaligned_jump_dph && EXTENSION_C ? {instr_data_shifted[W_BUNDLE +: 2*W_BUNDLE], fetch_data[W_BUNDLE +: W_BUNDLE]} :
-	level_next_no_fetch[0] && EXTENSION_C ? {fetch_data, instr_data_shifted[0 +: W_BUNDLE]} :
-	{instr_data_shifted[2*W_BUNDLE +: W_BUNDLE], fetch_data};
+	instr_fetch_overlay_blocked            ? instr_data_shifted :
+	level_next_no_fetch[1] && |EXTENSION_C ? {fetch_data_aligned[0 +: W_BUNDLE], instr_data_shifted[0 +: 2 * W_BUNDLE]} :
+	level_next_no_fetch[0] && |EXTENSION_C ? {fetch_data_aligned, instr_data_shifted[0 +: W_BUNDLE]} :
+	{instr_data_shifted[2 * W_BUNDLE +: W_BUNDLE], fetch_data_aligned};
 
-assign cir_must_refill = !cir_lock && !level_next_no_fetch[1];
+// Also keep track of bus errors associated with CIR contents, shifted in the
+// same way as instruction data. Errors may come straight from the bus, or
+// may be buffered in the prefetch queue.
+
+wire fetch_bus_err = fifo_empty ? mem_data_err : fifo_err[0];
+
+reg [2:0] cir_bus_err;
+wire [2:0] cir_bus_err_shifted =
+	cir_use[1]                ? cir_bus_err >> 2 :
+	cir_use[0] && EXTENSION_C ? cir_bus_err >> 1 : cir_bus_err;
+
+wire [2:0] cir_bus_err_plus_fetch =
+	instr_fetch_overlay_blocked            ? cir_bus_err_shifted :
+	level_next_no_fetch[1] && |EXTENSION_C ? {fetch_bus_err, cir_bus_err_shifted[1:0]} :
+	level_next_no_fetch[0] && |EXTENSION_C ? {{2{fetch_bus_err}}, cir_bus_err_shifted[0]} :
+	{cir_bus_err_shifted[2], {2{fetch_bus_err}}};
+
+assign cir_must_refill = !level_next_no_fetch[1];
 
 assign fifo_pop = cir_must_refill && !fifo_empty;
 
+wire [1:0] fetch_fill_amount = cir_must_refill && fetch_data_vld ? (
+	&fetch_data_hwvld ? 2'h2 : 2'h1
+) : 2'h0;
+
 wire [1:0] buf_level_next =
-	jump_now || |ctr_flush_pending || cir_lock ? 2'h0 :
-	fetch_data_vld && unaligned_jump_dph       ? 2'h1 :
-	buf_level + {cir_must_refill && fetch_data_vld, 1'b0} - cir_use_clipped;
+	jump_now && cir_flush_behind ? (cir[1:0] == 2'b11 || ~|EXTENSION_C ? 2'h2 : 2'h1) :
+	jump_now                     ? 2'h0 : level_next_no_fetch + fetch_fill_amount;
 
 always @ (posedge clk or negedge rst_n) begin
 	if (!rst_n) begin
 		buf_level <= 2'h0;
 		cir_vld <= 2'h0;
+		hwbuf <= 16'h0;
+		cir <= 16'h0;
+		cir_bus_err <= 3'h0;
 	end else begin
 `ifdef FORMAL
 		assert(cir_vld <= 2);
 		assert(cir_use <= cir_vld);
+		if (!jump_now)
+			assert(buf_level_next >= level_next_no_fetch);
 `endif
-		// Update CIR flags
 		buf_level <= buf_level_next;
-		if (!cir_lock)
-			cir_vld <= buf_level_next & ~(buf_level_next >> 1'b1);
-		// Update CIR contents
+		cir_vld <= buf_level_next & ~(buf_level_next >> 1'b1);
+		cir_bus_err <= cir_bus_err_plus_fetch;
+		{hwbuf, cir} <= instr_data_plus_fetch;
 	end
 end
 
-// No need to reset these as they will be written before first use
-always @ (posedge clk)
-	{hwbuf, cir} <= instr_data_plus_fetch;
-
 `ifdef FORMAL
 reg [1:0] property_past_buf_level; // Workaround for weird non-constant $past reset issue
 always @ (posedge clk or negedge rst_n) begin
@@ -377,31 +378,6 @@
 end
 `endif
 
-// Also keep track of bus errors associated with CIR contents, shifted in the
-// same way as instruction data. Errors may come straight from the bus, or
-// may be buffered in the prefetch queue.
-
-wire fetch_bus_err = fifo_empty ? mem_data_err : fifo_err[0];
-
-reg [2:0] cir_bus_err;
-wire [2:0] cir_bus_err_shifted =
-	cir_use[1]                ? cir_bus_err >> 2 :
-	cir_use[0] && EXTENSION_C ? cir_bus_err >> 1 : cir_bus_err;
-
-wire [2:0] cir_bus_err_plus_fetch =
-	instr_fetch_overlay_blocked        ? cir_bus_err_shifted :
-	unaligned_jump_dph && EXTENSION_C  ? {cir_bus_err_shifted[2:1], fetch_bus_err} :
-	level_next_no_fetch && EXTENSION_C ? {{2{fetch_bus_err}}, cir_bus_err_shifted[0]} :
-	{cir_bus_err_shifted[2], {2{fetch_bus_err}}};
-
-always @ (posedge clk or negedge rst_n) begin
-	if (!rst_n) begin
-		cir_bus_err <= 3'h0;
-	end else if (CSR_M_TRAP) begin
-		cir_bus_err <= cir_bus_err_plus_fetch;
-	end
-end
-
 assign cir_err = cir_bus_err[1:0];
 
 // -----------------------------------------------------------------------------
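Not part of the patch: a minimal, self-contained Verilog sketch that lifts the new buf_level_next selection out of hazard3_frontend.v so the flush-behind case can be exercised on its own. The module name, the cir_low_bits stand-in for cir[1:0], and the stimulus values are invented for illustration; only the mux expression is taken from the hunk above.

// flush_behind_level_model.v -- illustration only, not part of the Hazard3 tree.
module flush_behind_level_model;
	reg        jump_now;
	reg        cir_flush_behind;
	reg  [1:0] cir_low_bits;        // models cir[1:0]; 2'b11 means a 32-bit opcode
	reg  [1:0] level_next_no_fetch;
	reg  [1:0] fetch_fill_amount;
	localparam EXTENSION_C = 1;

	// Same selection as buf_level_next above: a flush-behind jump keeps the
	// oldest instruction, so the level becomes 2 halfwords for a 32-bit CIR
	// and 1 halfword for a compressed one; a plain jump empties the buffer.
	wire [1:0] buf_level_next =
		jump_now && cir_flush_behind ? (cir_low_bits == 2'b11 || ~|EXTENSION_C ? 2'h2 : 2'h1) :
		jump_now                     ? 2'h0 :
		                               level_next_no_fetch + fetch_fill_amount;

	initial begin
		level_next_no_fetch = 2'h0;
		fetch_fill_amount   = 2'h0;

		// Taken jump from a stalled 32-bit instruction: keep both halfwords.
		jump_now = 1; cir_flush_behind = 1; cir_low_bits = 2'b11;
		#1 $display("flush-behind, 32-bit CIR: buf_level_next = %0d", buf_level_next);

		// Retained instruction is compressed: keep one halfword.
		cir_low_bits = 2'b01;
		#1 $display("flush-behind, 16-bit CIR: buf_level_next = %0d", buf_level_next);

		// Ordinary jump with no flush-behind: CIR is emptied.
		cir_flush_behind = 0;
		#1 $display("plain jump:               buf_level_next = %0d", buf_level_next);
	end
endmodule

Run under any Verilog simulator this prints levels 2, 1 and 0 for the three cases, matching the cir_flush_behind comment: the stalled instruction that raised the jump request survives in the CIR while everything fetched behind it is discarded.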