Fix illegal issue of pipelined exclusives on the bus, and document correct timings

This commit is contained in:
Luke Wren 2021-12-04 18:23:01 +00:00
parent 5e17bb805e
commit a8933c332d
3 changed files with 2516 additions and 2083 deletions

File diff suppressed because it is too large Load Diff

View File

@ -77,8 +77,9 @@ Timings assume the core is configured with `MULDIV_UNROLL = 2` and `MUL_FAST = 1
[%autowidth.stretch, options="header"]
|===
| Instruction | Cycles | Note
| `lr.w rd, (rs1)` | 1 or 2 | 1 if next instruction is independent, 2 if dependent.footnote:data_dependency[]
| `sc.w rd, rs2, (rs1)` | 1 | `lr.w` followed by `sc.w` always inserts a dependency stall.footnote:lr_to_sc[A 1-cycle pipeline bubble is inserted in between an `lr.w` and an immediately-following `sc.w`, so that the store can be suppressed by a reservation failure on the load. It does not matter whether the `lr.w` and `sc.w` use the same registers. Load reservation may fail if the memory region does not support exclusive transfers.]
3+| Load-Reserved/Store-Conditional
| `lr.w rd, (rs1)` | 1 or 2 | 2 if next instruction is dependentfootnote:data_dependency[], or is an `lr.w`, `sc.w` or `amo*`.footnote:exclusive_pipelining[A pipeline bubble is inserted between `lr.w`/`sc.w` and an immediately-following `lr.w`/`sc.w`/`amo*`, because the AHB5 bus standard does not permit pipelined exclusive accesses. A stall would be needed between `lr.w` and `sc.w` in any case, so that the local monitor can be updated from the `lr.w` data phase in time to suppress the `sc.w` data phase.]
| `sc.w rd, rs2, (rs1)` | 1 or 2 | 2 if next instruction is an `lr.w`, `sc.w` or `amo*`.footnote:exclusive_pipelining[]
|===
AMOs are currently not supported.

View File

@ -285,9 +285,24 @@ wire m_wfi_stall_clear;
// Trap-related stall. Asserted in either of two situations:
//   - m_trap_enter_vld is high while m_trap_enter_rdy is still low, or
//   - m_trap_enter_soon is high but m_trap_enter_vld has not been raised yet.
wire x_stall_on_trap =
	(m_trap_enter_vld  && !m_trap_enter_rdy) ||
	(m_trap_enter_soon && !m_trap_enter_vld);
// Exclusive accesses must not be pipelined back-to-back on the bus. A stall
// is therefore raised whenever the instruction presented on d_memop is an
// lr.w, an sc.w or an AMO while xm_memop still holds an lr.w or sc.w. The same
// stall gives the local monitor time to update on a direct lr.w -> sc.w
// sequence.
//
// AMOs are deliberately not matched on the xm_memop side: an AMO fully fences
// off on its own completion before it passes down the pipe.
//
// Both wires are forced low when EXTENSION_A is all-zeroes.

// d_memop lies within the inclusive range MEMOP_AMOSWAP_W .. MEMOP_AMOMAXU_W
wire d_memop_is_amo = |EXTENSION_A && (
	d_memop <= MEMOP_AMOMAXU_W && d_memop >= MEMOP_AMOSWAP_W
);

wire x_stall_on_exclusive_overlap = |EXTENSION_A && (
	(xm_memop == MEMOP_LR_W || xm_memop == MEMOP_SC_W) &&
	(d_memop == MEMOP_LR_W || d_memop == MEMOP_SC_W || d_memop_is_amo)
);
// x_stall is the logical OR of every individual stall cause listed below.
assign x_stall =
	// Stall already present in stage M
	m_stall ||
	// Trap entry pending, or exclusive accesses would overlap on the bus
	x_stall_on_trap ||
	x_stall_on_exclusive_overlap ||
	// Register read-after-write hazard, or multiply/divide stall
	x_stall_raw ||
	x_stall_muldiv ||
	// bus_aph_req_d asserted while bus_aph_ready_d is still low
	(bus_aph_req_d && !bus_aph_ready_d) ||
	// x_jump_req asserted while f_jump_rdy is still low
	(x_jump_req && !f_jump_rdy);
@ -296,15 +311,13 @@ wire m_fast_mul_result_vld;
// High when a fast-multiply result is flagged valid, or when xm_memop compares
// below MEMOP_SW (presumably the load encodings -- confirm against the MEMOP_*
// definitions). Consumed by the load-use / multiply-use hazard check.
wire m_generating_result = m_fast_mul_result_vld || (xm_memop < MEMOP_SW);
// Load-use hazard detection
always @ (*) begin
x_stall_raw = 1'b0;
if (REDUCED_BYPASS) begin
x_stall_raw =
|xm_rd && (xm_rd == d_rs1 || xm_rd == d_rs2) ||
|mw_rd && (mw_rd == d_rs1 || mw_rd == d_rs2);
end else if (|EXTENSION_A && xm_memop == MEMOP_LR_W && d_memop == MEMOP_SC_W) begin
// Conditional-store address phase depends on data-phase update of local monitor bit
x_stall_raw = 1'b1;
end else if (m_generating_result) begin
// With the full bypass network, load-use (or fast multiply-use) is the only RAW stall
if (|xm_rd && xm_rd == d_rs1) begin
@ -398,6 +411,7 @@ always @ (*) begin
endcase
bus_aph_req_d = x_memop_vld && !(
x_stall_raw ||
x_stall_on_exclusive_overlap ||
x_unaligned_addr ||
m_trap_enter_soon ||
(xm_wfi && !m_wfi_stall_clear) // FIXME will cause a timing issue, better to stall til *after* clear