Save a cycle on popret/popretz by executing the stack adjust after the jump

This commit is contained in:
Luke Wren 2023-03-23 02:50:34 +00:00
parent b074d370a6
commit 4a1d2b5008
6 changed files with 3181 additions and 2855 deletions

File diff suppressed because it is too large Load Diff

View File

@ -182,13 +182,12 @@ They perform identically to their 32-bit counterparts.
| Instruction | Cycles | Note
|`cm.push {rlist}, -imm` | 1 + _n_ | _n_ is number of registers in rlist
|`cm.pop {rlist}, imm` | 1 + _n_ | _n_ is number of registers in rlist
|`cm.popret {rlist}, imm` | 3 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.popretz {rlist}, imm` | 4 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.popret {rlist}, imm` | 4 (_n_ = 1)footnote:popret_stall[The single-register variants of `cm.popret` and `cm.popretz` take the same number of cycles as the two-register variants, because of an internal load-use dependency on the loaded return address.] or 2 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.popretz {rlist}, imm` | 5 (_n_ = 1)footnote:popret_stall[] or 3 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.mva01s r1s', r2s'` | 2 |
|`cm.mvsa01 r1s', r2s'` | 2 |
|===
=== Branch Predictor
Hazard3 includes a minimal branch predictor, to accelerate tight loops:

View File

@ -61,11 +61,11 @@ parameter EXTENSION_ZBKB = 0,
// EXTENSION_ZCB: Support for Zcb basic additional compressed instructions
// Requires: EXTENSION_C. (Some Zcb instructions also require Zbb or M.)
// Note Zca is equivalent to C, as we do not support the F extension.
parameter EXTENSION_ZCB = 0,
parameter EXTENSION_ZCB = 1,
// EXTENSION_ZCMP: Support for Zcmp push/pop instructions.
// Requires: EXTENSION_C.
parameter EXTENSION_ZCMP = 0,
parameter EXTENSION_ZCMP = 1,
// EXTENSION_ZIFENCEI: Support for the fence.i instruction
// Optional, since a plain branch/jump will also flush the prefetch queue.

View File

@ -78,7 +78,9 @@ wire d_invalid_16bit;
reg d_invalid_32bit;
wire d_invalid = d_invalid_16bit || d_invalid_32bit;
wire uop_nonfinal;
wire uop_seq_raw;
wire uop_final;
wire uop_no_pc_update;
wire uop_atomic;
wire uop_stall;
wire uop_clear;
@ -86,26 +88,40 @@ wire uop_clear;
hazard3_instr_decompress #(
`include "hazard3_config_inst.vh"
) decomp (
.clk (clk),
.rst_n (rst_n),
.clk (clk),
.rst_n (rst_n),
.instr_in (fd_cir),
.instr_is_32bit (d_instr_is_32bit),
.instr_out (d_instr),
.instr_out_uop_nonfinal (uop_nonfinal),
.instr_out_uop_atomic (uop_atomic),
.instr_out_uop_stall (uop_stall),
.instr_out_uop_clear (uop_clear),
.instr_in (fd_cir),
.instr_is_32bit (d_instr_is_32bit),
.instr_out (d_instr),
.instr_out_is_uop (uop_seq_raw),
.instr_out_is_final_uop (uop_final),
.instr_out_uop_no_pc_update (uop_no_pc_update),
.instr_out_uop_atomic (uop_atomic),
.instr_out_uop_stall (uop_stall),
.instr_out_uop_clear (uop_clear),
.df_uop_step_next (df_uop_step_next),
.df_uop_step_next (df_uop_step_next),
.invalid (d_invalid_16bit)
.invalid (d_invalid_16bit)
);
assign d_uninterruptible = uop_atomic && !d_invalid;
assign d_no_pc_increment = uop_nonfinal && !d_invalid;
wire uop_seq = uop_seq_raw && !d_starved;
wire uop_nonfinal = uop_seq && !uop_final;
assign uop_stall = x_stall || d_starved;
assign uop_clear = f_jump_now;
assign d_uninterruptible = uop_atomic && !d_invalid;
// Signal to null the mepc offset when taking an exception on this
// instruction. Every uop in a sequence *which can except* (i.e. excluding
// the final sp adjust on popret/popretz) has the same PC as the next uop,
// which will be in stage 2 when the exception is taken.
assign d_no_pc_increment = uop_nonfinal;
// Note !df_cir_flush_behind because the jump in cm.popret/popretz is
// the *penultimate* instruction: we execute the stack adjustment in the
// fetch bubble to save a cycle, so we still need to finish the uop sequence.
assign uop_clear = f_jump_now && !df_cir_flush_behind;
// Decode various immediate formats
wire [31:0] d_imm_i = {{21{d_instr[31]}}, d_instr[30:20]};
@ -157,8 +173,8 @@ end
reg [W_ADDR-1:0] pc;
wire [W_ADDR-1:0] pc_seq_next = pc + (
|EXTENSION_ZCMP && uop_nonfinal ? 32'h0 :
d_instr_is_32bit ? 32'h4 : 32'h2
|EXTENSION_ZCMP && uop_seq && uop_no_pc_update ? 32'h0 :
d_instr_is_32bit ? 32'h4 : 32'h2
);
assign d_pc = pc;
@ -174,6 +190,14 @@ wire partial_predicted_branch = !d_starved &&
wire predicted_branch = |BRANCH_PREDICTOR && fd_cir_predbranch[0];
// Generally locking takes place on a stalled jump/branch, which may need the
// original PC available to produce a link address when it unstalls. An
// exception to this is jumps in micro-op sequences: in this case the jump is
// the penultimate instruction in the sequence (ret before addi sp) and we
// need to capture the pc mid-uop-sequence.
wire hold_pc_on_cir_lock = assert_cir_lock && !(uop_seq && !uop_no_pc_update);
wire update_pc_on_cir_unlock = cir_lock_prev && deassert_cir_lock && !(uop_seq && uop_no_pc_update);
always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin
pc <= RESET_VECTOR;
@ -182,7 +206,7 @@ always @ (posedge clk or negedge rst_n) begin
pc <= debug_dpc_wdata;
end else if (debug_mode) begin
pc <= pc;
end else if ((f_jump_now && !assert_cir_lock) || (cir_lock_prev && deassert_cir_lock)) begin
end else if ((f_jump_now && !hold_pc_on_cir_lock) || update_pc_on_cir_unlock) begin
pc <= f_jump_target;
end else if (!d_stall && !cir_lock) begin
// If this instruction is a predicted-taken branch (and has not

View File

@ -559,8 +559,8 @@ wire [4:0] zcmp_pushpop_rs2 =
wire [4:0] zcmp_pushpop_rs1 =
uop_ctr < 4'hd ? 5'd02 : // sp (addr base reg)
uop_ctr == 4'hd ? 5'd00 : // zero (clear a0)
uop_ctr == 4'he ? 5'd02 : // sp (stack adj)
5'd01 ; // ra (ret)
uop_ctr == 4'he ? 5'd01 : // ra (ret)
5'd02 ; // sp (stack adj)
wire [4:0] zcmp_sa01_r1s = {|next_instr[9:8], ~|next_instr[9:8], next_instr[9:7]};
wire [4:0] zcmp_sa01_r2s = {|next_instr[4:3], ~|next_instr[4:3], next_instr[4:2]};

View File

@ -17,10 +17,11 @@ module hazard3_instr_decompress #(
output reg instr_is_32bit,
output reg [31:0] instr_out,
// Indicate instr_out is a uop, and more uops follow in this sequence.
// Should suppress PC update, and null the PC offset in the mepc address
// in stage 3.
output wire instr_out_uop_nonfinal,
// If instruction is a non-final uop, need to suppress PC update, and null
// the PC offset in the mepc address in stage 3.
output wire instr_out_is_uop,
output wire instr_out_is_final_uop,
output wire instr_out_uop_no_pc_update,
// Indicate instr_out is a uop from the noninterruptible part of a uop
// sequence. If one uop is noninterruptible, all following uops until the
// end of the sequence are also noninterruptible.
@ -107,17 +108,20 @@ function [31:0] rfmt_rs2; input [4:0] rs2; begin rfmt_rs2 = {7'h00, rs2, 20'h000
//
// - 13x lw (counter = 0..12)
// - 1x addi to set a0 to zero (counter = 13 ) < atomic section
// - 1x addi to adjust sp (counter = 14 ) < atomic section
// - 1x jalr to jump through ra (counter = 15 ) < atomic section
// - 1x jalr to jump through ra (counter = 14 ) < atomic section
// - 1x addi to adjust sp (counter = 15 ) < atomic section
reg [3:0] uop_ctr;
reg [3:0] uop_ctr_nxt;
reg in_uop_seq;
reg uop_seq_end;
reg uop_atomic;
reg uop_no_pc_update;
assign instr_out_uop_nonfinal = in_uop_seq && !uop_seq_end;
assign instr_out_is_uop = in_uop_seq;
assign instr_out_is_final_uop = uop_seq_end;
assign instr_out_uop_atomic = uop_atomic;
assign instr_out_uop_no_pc_update = uop_no_pc_update;
assign df_uop_step_next = uop_ctr_nxt;
// The offset from current sp value to the lowest-addressed saved register, +64.
@ -188,6 +192,7 @@ end else begin: instr_decompress
uop_seq_end = 1'b0;
in_uop_seq = 1'b0;
uop_atomic = 1'b0;
uop_no_pc_update = 1'b0;
uop_ctr_nxt = uop_ctr;
casez (instr_in[15:0])
16'h0: invalid = 1'b1;
@ -295,7 +300,7 @@ end else begin: instr_decompress
// Optional Zcmp instructions:
`RVOPC_CM_PUSH: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
invalid = 1'b1;
end else if (uop_ctr == 4'he) begin
end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1;
uop_seq_end = 1'b1;
uop_ctr_nxt = 4'h0;
@ -304,14 +309,15 @@ end else begin: instr_decompress
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_push_sw_instr;
uop_no_pc_update = 1'b1;
if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he;
uop_ctr_nxt = 4'hf;
end
end
`RVOPC_CM_POP: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
invalid = 1'b1;
end else if (uop_ctr == 4'he) begin
end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1;
uop_seq_end = 1'b1;
uop_ctr_nxt = 4'h0;
@ -320,9 +326,10 @@ end else begin: instr_decompress
end else begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_lw_instr;
if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he;
uop_ctr_nxt = 4'hf;
end
end
@ -334,17 +341,19 @@ end else begin: instr_decompress
// executes, they all execute. Having none execute is fine.
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_stack_adj_instr;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1;
uop_seq_end = 1'b1;
uop_atomic = 1'b1;
uop_ctr_nxt = 4'h0;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_stack_adj_instr;
end else begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_lw_instr;
uop_no_pc_update = 1'b1;
if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he;
end
@ -355,21 +364,24 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'hd) begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10); // li a0, 0
end else if (uop_ctr == 4'he) begin
in_uop_seq = 1'b1;
uop_atomic = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_stack_adj_instr;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1;
uop_seq_end = 1'b1;
uop_atomic = 1'b1;
uop_ctr_nxt = 4'h0;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_stack_adj_instr;
end else begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_lw_instr;
if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'hd;
@ -381,6 +393,7 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'h0) begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(zcmp_sa01_r1s) | rfmt_rs1(5'd10);
end else begin
in_uop_seq = 1'b1;
@ -395,6 +408,7 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'h0) begin
in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10) | rfmt_rs1(zcmp_sa01_r1s);
end else begin
in_uop_seq = 1'b1;