First plausibly working AMOs. Add AMOs to instruction timings list
This commit is contained in:
parent
5c098866f2
commit
df658d86ff
11599
doc/hazard3.pdf
11599
doc/hazard3.pdf
File diff suppressed because it is too large
Load Diff
|
@ -32,7 +32,7 @@ All timings are given assuming perfect bus behaviour (no downstream bus stalls).
|
||||||
| `lui rd, imm` | 1 |
|
| `lui rd, imm` | 1 |
|
||||||
| `auipc rd, imm` | 1 |
|
| `auipc rd, imm` | 1 |
|
||||||
3+| Control Transfer
|
3+| Control Transfer
|
||||||
| `jal rd, label` | 2footnote:unaligned_branch[A branch to a 32-bit instruction which is not 32-bit-aligned requires one additional cycle, because two naturally-aligned bus cycles are required to fetch the target instruction.]|
|
| `jal rd, label` | 2footnote:unaligned_branch[A branch to a 32-bit instruction which is not 32-bit-aligned requires one additional cycle, because two naturally aligned bus cycles are required to fetch the target instruction.]|
|
||||||
| `jalr rd, rs1, imm` | 2footnote:unaligned_branch[] |
|
| `jalr rd, rs1, imm` | 2footnote:unaligned_branch[] |
|
||||||
| `beq rs1, rs2, label`| 1 or 2footnote:unaligned_branch[] | 1 if nontaken, 2 if taken.
|
| `beq rs1, rs2, label`| 1 or 2footnote:unaligned_branch[] | 1 if nontaken, 2 if taken.
|
||||||
| `bne rs1, rs2, label`| 1 or 2footnote:unaligned_branch[] | 1 if nontaken, 2 if taken.
|
| `bne rs1, rs2, label`| 1 or 2footnote:unaligned_branch[] | 1 if nontaken, 2 if taken.
|
||||||
|
@ -78,12 +78,20 @@ Timings assume the core is configured with `MULDIV_UNROLL = 2` and `MUL_FAST = 1
|
||||||
|===
|
|===
|
||||||
| Instruction | Cycles | Note
|
| Instruction | Cycles | Note
|
||||||
3+| Load-Reserved/Store-Conditional
|
3+| Load-Reserved/Store-Conditional
|
||||||
| `lr.w rd, (rs1)` | 1 or 2 | 2 if next instruction is dependentfootnote:data_dependency[], or is an `lr.w`, `sc.w` or `amo*`.footnote:exclusive_pipelining[A pipeline bubble is inserted between `lr.w`/`sc.w` and an immediately-following `lr.w`/`sc.w`/`amo*`, because the AHB5 bus standard does not permit pipelined exclusive accesses. A stall would be inserted between `lr.w` and `sc.w` anyhow, so the local monitor can be updated based on `lr.w` data phase in time to suppress `sc.w` data phase.]
|
| `lr.w rd, (rs1)` | 1 or 2 | 2 if next instruction is dependentfootnote:data_dependency[], or is an `lr.w`, `sc.w` or `amo*.w`.footnote:exclusive_pipelining[A pipeline bubble is inserted between `lr.w`/`sc.w` and an immediately following `lr.w`/`sc.w`/`amo*.w`, because the AHB5 bus standard does not permit pipelined exclusive accesses. A stall would be inserted between `lr.w` and `sc.w` anyhow, so the local monitor can be updated based on the `lr.w` data phase in time to suppress the `sc.w` address phase.]
|
||||||
| `sc.w rd, rs2, (rs1)` | 1 or 2 | 2 if next instruction is an `lr.w`, `sc.w` or `amo*`.footnote:exclusive_pipelining[]
|
| `sc.w rd, rs2, (rs1)` | 1 or 2 | 2 if next instruction is an `lr.w`, `sc.w` or `amo*.w`.footnote:exclusive_pipelining[]
|
||||||
|
3+| Atomic Memory Operations
|
||||||
|
|`amoswap.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[AMOs are issued as a paired exclusive read and exclusive write on the bus, at the maximum speed of 2 cycles per access, since the bus does not permit pipelining of exclusive reads/writes. If the write phase fails due to the global monitor reporting a lost reservation, the instruction loops at a rate of 4 cycles per loop, until success. If the read reservation is refused by the global monitor, the instruction generates a Store/AMO Fault exception, to avoid an infinite loop.]
|
||||||
|
|`amoadd.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amoxor.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amoand.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amoor.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amomin.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amomax.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amominu.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|
|`amomaxu.w rd, rs2, (rs1)` | 4+ | 4 per attempt. Multiple attempts if reservation is lost.footnote:amo_timing[]
|
||||||
|===
|
|===
|
||||||
|
|
||||||
AMOs are currently not supported.
|
|
||||||
|
|
||||||
=== C Extension
|
=== C Extension
|
||||||
|
|
||||||
All C extension 16-bit instructions on Hazard3 are aliases of base RV32I instructions. They perform identically to their 32-bit counterparts.
|
All C extension 16-bit instructions on Hazard3 are aliases of base RV32I instructions. They perform identically to their 32-bit counterparts.
|
||||||
|
|
|
@ -403,7 +403,7 @@ hazard3_alu #(
|
||||||
always @ (posedge clk or negedge rst_n) begin
|
always @ (posedge clk or negedge rst_n) begin
|
||||||
if (!rst_n) begin
|
if (!rst_n) begin
|
||||||
x_amo_phase <= 3'h0;
|
x_amo_phase <= 3'h0;
|
||||||
end else if (|EXTENSION_A && !x_stall) begin
|
end else if (|EXTENSION_A && (bus_aph_ready_d || bus_dph_ready_d || m_trap_enter_vld)) begin
|
||||||
if (!d_memop_is_amo) begin
|
if (!d_memop_is_amo) begin
|
||||||
x_amo_phase <= 3'h0;
|
x_amo_phase <= 3'h0;
|
||||||
end else if (x_stall_on_raw) begin
|
end else if (x_stall_on_raw) begin
|
||||||
|
@ -413,7 +413,7 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
assert(x_amo_phase == 3'h0);
|
assert(x_amo_phase == 3'h0);
|
||||||
`endif
|
`endif
|
||||||
x_amo_phase <= 3'h0;
|
x_amo_phase <= 3'h0;
|
||||||
end else if (m_trap_enter_soon) begin
|
end else if (m_trap_enter_vld) begin
|
||||||
x_amo_phase <= 3'h0;
|
x_amo_phase <= 3'h0;
|
||||||
end else if (x_amo_phase == 3'h1 && !bus_dph_exokay_d) begin
|
end else if (x_amo_phase == 3'h1 && !bus_dph_exokay_d) begin
|
||||||
// Load reserve fail indicates the memory region does not support
|
// Load reserve fail indicates the memory region does not support
|
||||||
|
@ -449,7 +449,11 @@ wire x_unaligned_addr = d_memop != MEMOP_NONE && (
|
||||||
);
|
);
|
||||||
|
|
||||||
// Always query the global monitor, except for store-conditional suppressed by local monitor.
|
// Always query the global monitor, except for store-conditional suppressed by local monitor.
|
||||||
assign bus_aph_excl_d = |EXTENSION_A && (d_memop == MEMOP_LR_W || d_memop == MEMOP_SC_W);
|
assign bus_aph_excl_d = |EXTENSION_A && (
|
||||||
|
d_memop == MEMOP_LR_W ||
|
||||||
|
d_memop == MEMOP_SC_W ||
|
||||||
|
d_memop_is_amo
|
||||||
|
);
|
||||||
|
|
||||||
always @ (*) begin
|
always @ (*) begin
|
||||||
// Need to be careful not to use anything hready-sourced to gate htrans!
|
// Need to be careful not to use anything hready-sourced to gate htrans!
|
||||||
|
@ -681,7 +685,8 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
if (!m_stall) begin
|
if (!m_stall) begin
|
||||||
{xm_rs1, xm_rs2, xm_rd} <= {d_rs1, d_rs2, d_rd};
|
{xm_rs1, xm_rs2, xm_rd} <= {d_rs1, d_rs2, d_rd};
|
||||||
// If the transfer is unaligned, make sure it is completely NOP'd on the bus
|
// If the transfer is unaligned, make sure it is completely NOP'd on the bus
|
||||||
xm_memop <= d_memop | {x_unaligned_addr, 3'h0};
|
// Likewise, AMOs are handled entirely in X (well it's ambiguous; anyway different logic & stalls)
|
||||||
|
xm_memop <= x_unaligned_addr || d_memop_is_amo ? MEMOP_NONE : d_memop;
|
||||||
xm_except <= x_except;
|
xm_except <= x_except;
|
||||||
xm_wfi <= d_wfi;
|
xm_wfi <= d_wfi;
|
||||||
if (x_stall || m_trap_enter_soon) begin
|
if (x_stall || m_trap_enter_soon) begin
|
||||||
|
@ -706,6 +711,8 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
reg [W_DATA-1:0] amo_load_data;
|
||||||
|
|
||||||
// Datapath flops
|
// Datapath flops
|
||||||
always @ (posedge clk or negedge rst_n) begin
|
always @ (posedge clk or negedge rst_n) begin
|
||||||
if (!rst_n) begin
|
if (!rst_n) begin
|
||||||
|
@ -713,9 +720,10 @@ always @ (posedge clk or negedge rst_n) begin
|
||||||
xm_store_data <= {W_DATA{1'b0}};
|
xm_store_data <= {W_DATA{1'b0}};
|
||||||
end else if (!m_stall) begin
|
end else if (!m_stall) begin
|
||||||
xm_result <=
|
xm_result <=
|
||||||
d_csr_ren ? x_csr_rdata :
|
d_csr_ren ? x_csr_rdata :
|
||||||
EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result :
|
|EXTENSION_A && d_memop_is_amo ? amo_load_data :
|
||||||
x_alu_result;
|
|EXTENSION_M && d_aluop == ALUOP_MULDIV ? x_muldiv_result :
|
||||||
|
x_alu_result;
|
||||||
xm_store_data <= x_rs2_bypass;
|
xm_store_data <= x_rs2_bypass;
|
||||||
|
|
||||||
end else if (d_memop_is_amo && x_amo_phase == 3'h1 && bus_dph_ready_d) begin
|
end else if (d_memop_is_amo && x_amo_phase == 3'h1 && bus_dph_ready_d) begin
|
||||||
|
@ -773,7 +781,6 @@ generate
|
||||||
if (EXTENSION_A) begin: has_amo_alu
|
if (EXTENSION_A) begin: has_amo_alu
|
||||||
|
|
||||||
reg [W_MEMOP-1:0] amo_memop;
|
reg [W_MEMOP-1:0] amo_memop;
|
||||||
reg [W_DATA-1:0] amo_load_data;
|
|
||||||
reg m_amo_wdata_valid_r;
|
reg m_amo_wdata_valid_r;
|
||||||
|
|
||||||
assign m_amo_wdata_valid = m_amo_wdata_valid_r;
|
assign m_amo_wdata_valid = m_amo_wdata_valid_r;
|
||||||
|
@ -805,6 +812,7 @@ end else begin: no_amo_alu
|
||||||
|
|
||||||
assign m_amo_wdata = {W_DATA{1'b0}};
|
assign m_amo_wdata = {W_DATA{1'b0}};
|
||||||
assign m_amo_wdata_valid = 1'b0;
|
assign m_amo_wdata_valid = 1'b0;
|
||||||
|
always @ (*) amo_load_data = {W_DATA{1'b0}};
|
||||||
|
|
||||||
end
|
end
|
||||||
endgenerate
|
endgenerate
|
||||||
|
@ -822,7 +830,7 @@ always @ (*) begin
|
||||||
MEMOP_SB: bus_wdata_d = {4{m_wdata[7:0]}};
|
MEMOP_SB: bus_wdata_d = {4{m_wdata[7:0]}};
|
||||||
default: bus_wdata_d = m_wdata;
|
default: bus_wdata_d = m_wdata;
|
||||||
endcase
|
endcase
|
||||||
if (|EXTENSION_A && amo_wdata_valid)
|
if (|EXTENSION_A && m_amo_wdata_valid)
|
||||||
bus_wdata_d = m_amo_wdata;
|
bus_wdata_d = m_amo_wdata;
|
||||||
|
|
||||||
casez ({xm_memop, xm_result[1:0]})
|
casez ({xm_memop, xm_result[1:0]})
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
SRCS := ../common/init.S main.c
|
||||||
|
APP := amo_smoke
|
||||||
|
CCFLAGS = -march=rv32imac -Os
|
||||||
|
|
||||||
|
include ../common/src_only_app.mk
|
|
@ -0,0 +1,46 @@
|
||||||
|
[*]
|
||||||
|
[*] GTKWave Analyzer v3.3.103 (w)1999-2019 BSI
|
||||||
|
[*] Sat Dec 4 23:25:19 2021
|
||||||
|
[*]
|
||||||
|
[dumpfile] "/home/luke/proj/hazard3/test/sim/amo_smoke/amo_smoke_run.vcd"
|
||||||
|
[dumpfile_mtime] "Sat Dec 4 23:21:36 2021"
|
||||||
|
[dumpfile_size] 6246687
|
||||||
|
[savefile] "/home/luke/proj/hazard3/test/sim/amo_smoke/amo_smoke.gtkw"
|
||||||
|
[timestart] 420
|
||||||
|
[size] 1975 1095
|
||||||
|
[pos] -1 -1
|
||||||
|
*-3.000000 458 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
|
||||||
|
[treeopen] core.
|
||||||
|
[sst_width] 233
|
||||||
|
[signals_width] 246
|
||||||
|
[sst_expanded] 1
|
||||||
|
[sst_vpaned_height] 317
|
||||||
|
@22
|
||||||
|
core.d_pc[31:0]
|
||||||
|
@28
|
||||||
|
core.mw_local_exclusive_reserved
|
||||||
|
core.m_stall
|
||||||
|
core.x_stall
|
||||||
|
core.x_stall_on_amo
|
||||||
|
@200
|
||||||
|
-
|
||||||
|
@22
|
||||||
|
d_haddr[31:0]
|
||||||
|
@28
|
||||||
|
d_htrans[1:0]
|
||||||
|
d_hsize[2:0]
|
||||||
|
@29
|
||||||
|
d_hexcl
|
||||||
|
@28
|
||||||
|
d_hwrite
|
||||||
|
d_hready
|
||||||
|
@22
|
||||||
|
d_hwdata[31:0]
|
||||||
|
d_hrdata[31:0]
|
||||||
|
@200
|
||||||
|
-
|
||||||
|
@22
|
||||||
|
core.x_rs2_bypass[31:0]
|
||||||
|
core.xm_store_data[31:0]
|
||||||
|
[pattern_trace] 1
|
||||||
|
[pattern_trace] 0
|
|
@ -0,0 +1,25 @@
|
||||||
|
#include "tb_cxxrtl_io.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
volatile uint32_t scratch[2];
|
||||||
|
|
||||||
|
// Check `cond`; on failure, print the given tb_printf-style message and make
// the enclosing function return -1. Wrapped in do/while(0) so the macro
// expands to a single statement and is safe inside an unbraced if/else.
#define test_assert(cond, ...) do { if (!(cond)) { tb_printf(__VA_ARGS__); return -1; } } while (0)
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
scratch[0] = 0;
|
||||||
|
|
||||||
|
tb_puts("Initial value: 0\n");
|
||||||
|
for (int i = 1; i <= 10; ++i) {
|
||||||
|
uint32_t fetched;
|
||||||
|
asm volatile (
|
||||||
|
"amoadd.w %0, %1, (%2)\n"
|
||||||
|
: "=r" (fetched)
|
||||||
|
: "r" (i), "r" (&scratch[0])
|
||||||
|
);
|
||||||
|
tb_printf("amoadd.w rd, %d, (&addr) -> fetched %d\n", i, fetched);
|
||||||
|
}
|
||||||
|
tb_printf("Final value: %d\n", scratch[0]);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue