Save a cycle on popret/popretz by executing the stack adjust after the jump

This commit is contained in:
Luke Wren 2023-03-23 02:50:34 +00:00
parent b074d370a6
commit 4a1d2b5008
6 changed files with 3181 additions and 2855 deletions

File diff suppressed because it is too large Load Diff

View File

@ -182,13 +182,12 @@ They perform identically to their 32-bit counterparts.
| Instruction | Cycles | Note | Instruction | Cycles | Note
|`cm.push {rlist}, -imm` | 1 + _n_ | _n_ is number of registers in rlist |`cm.push {rlist}, -imm` | 1 + _n_ | _n_ is number of registers in rlist
|`cm.pop {rlist}, imm` | 1 + _n_ | _n_ is number of registers in rlist |`cm.pop {rlist}, imm` | 1 + _n_ | _n_ is number of registers in rlist
|`cm.popret {rlist}, imm` | 3 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist |`cm.popret {rlist}, imm` | 4 (_n_ = 1)footnote:popret_stall[The single-register variants of `cm.popret` and `cm.popretz` take the same number of cycles as the two-register variants, because of an internal load-use dependency on the loaded return address.] or 2 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.popretz {rlist}, imm` | 4 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist |`cm.popretz {rlist}, imm` | 5 (_n_ = 1)footnote:popret_stall[] or 3 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|`cm.mva01s r1s', r2s'` | 2 | |`cm.mva01s r1s', r2s'` | 2 |
|`cm.mvsa01 r1s', r2s'` | 2 | |`cm.mvsa01 r1s', r2s'` | 2 |
|=== |===
=== Branch Predictor === Branch Predictor
Hazard3 includes a minimal branch predictor, to accelerate tight loops: Hazard3 includes a minimal branch predictor, to accelerate tight loops:

View File

@ -61,11 +61,11 @@ parameter EXTENSION_ZBKB = 0,
// EXTENSION_ZCB: Support for Zcb basic additional compressed instructions // EXTENSION_ZCB: Support for Zcb basic additional compressed instructions
// Requires: EXTENSION_C. (Some Zcb instructions also require Zbb or M.) // Requires: EXTENSION_C. (Some Zcb instructions also require Zbb or M.)
// Note Zca is equivalent to C, as we do not support the F extension. // Note Zca is equivalent to C, as we do not support the F extension.
parameter EXTENSION_ZCB = 0, parameter EXTENSION_ZCB = 1,
// EXTENSION_ZCMP: Support for Zcmp push/pop instructions. // EXTENSION_ZCMP: Support for Zcmp push/pop instructions.
// Requires: EXTENSION_C. // Requires: EXTENSION_C.
parameter EXTENSION_ZCMP = 0, parameter EXTENSION_ZCMP = 1,
// EXTENSION_ZIFENCEI: Support for the fence.i instruction // EXTENSION_ZIFENCEI: Support for the fence.i instruction
// Optional, since a plain branch/jump will also flush the prefetch queue. // Optional, since a plain branch/jump will also flush the prefetch queue.

View File

@ -78,7 +78,9 @@ wire d_invalid_16bit;
reg d_invalid_32bit; reg d_invalid_32bit;
wire d_invalid = d_invalid_16bit || d_invalid_32bit; wire d_invalid = d_invalid_16bit || d_invalid_32bit;
wire uop_nonfinal; wire uop_seq_raw;
wire uop_final;
wire uop_no_pc_update;
wire uop_atomic; wire uop_atomic;
wire uop_stall; wire uop_stall;
wire uop_clear; wire uop_clear;
@ -92,7 +94,9 @@ hazard3_instr_decompress #(
.instr_in (fd_cir), .instr_in (fd_cir),
.instr_is_32bit (d_instr_is_32bit), .instr_is_32bit (d_instr_is_32bit),
.instr_out (d_instr), .instr_out (d_instr),
.instr_out_uop_nonfinal (uop_nonfinal), .instr_out_is_uop (uop_seq_raw),
.instr_out_is_final_uop (uop_final),
.instr_out_uop_no_pc_update (uop_no_pc_update),
.instr_out_uop_atomic (uop_atomic), .instr_out_uop_atomic (uop_atomic),
.instr_out_uop_stall (uop_stall), .instr_out_uop_stall (uop_stall),
.instr_out_uop_clear (uop_clear), .instr_out_uop_clear (uop_clear),
@ -102,10 +106,22 @@ hazard3_instr_decompress #(
.invalid (d_invalid_16bit) .invalid (d_invalid_16bit)
); );
assign d_uninterruptible = uop_atomic && !d_invalid; wire uop_seq = uop_seq_raw && !d_starved;
assign d_no_pc_increment = uop_nonfinal && !d_invalid; wire uop_nonfinal = uop_seq && !uop_final;
assign uop_stall = x_stall || d_starved; assign uop_stall = x_stall || d_starved;
assign uop_clear = f_jump_now;
assign d_uninterruptible = uop_atomic && !d_invalid;
// Signal to null the mepc offset when taking an exception on this
// instruction. Every uop in a sequence *which can take an exception* (this
// excludes the final sp adjust on popret/popretz) has the same PC as the uop
// that follows it, and that following uop will be in stage 2 when the
// exception is taken.
assign d_no_pc_increment = uop_nonfinal;
// Note !df_cir_flush_behind: the jump in cm.popret/popretz is the
// *penultimate* instruction. We execute the stack adjustment in the fetch
// bubble to save a cycle, so we still need to finish the uop sequence after
// the jump rather than clearing it.
assign uop_clear = f_jump_now && !df_cir_flush_behind;
// Decode various immediate formats // Decode various immediate formats
wire [31:0] d_imm_i = {{21{d_instr[31]}}, d_instr[30:20]}; wire [31:0] d_imm_i = {{21{d_instr[31]}}, d_instr[30:20]};
@ -157,7 +173,7 @@ end
reg [W_ADDR-1:0] pc; reg [W_ADDR-1:0] pc;
wire [W_ADDR-1:0] pc_seq_next = pc + ( wire [W_ADDR-1:0] pc_seq_next = pc + (
|EXTENSION_ZCMP && uop_nonfinal ? 32'h0 : |EXTENSION_ZCMP && uop_seq && uop_no_pc_update ? 32'h0 :
d_instr_is_32bit ? 32'h4 : 32'h2 d_instr_is_32bit ? 32'h4 : 32'h2
); );
@ -174,6 +190,14 @@ wire partial_predicted_branch = !d_starved &&
wire predicted_branch = |BRANCH_PREDICTOR && fd_cir_predbranch[0]; wire predicted_branch = |BRANCH_PREDICTOR && fd_cir_predbranch[0];
// Generally locking takes place on a stalled jump/branch, which may need the
// original PC available to produce a link address when it unstalls. An
// exception to this is jumps in micro-op sequences: in this case the jump is
// the penultimate instruction in the sequence (ret before addi sp) and we
// need to capture the pc mid-uop-sequence.
wire hold_pc_on_cir_lock = assert_cir_lock && !(uop_seq && !uop_no_pc_update);
wire update_pc_on_cir_unlock = cir_lock_prev && deassert_cir_lock && !(uop_seq && uop_no_pc_update);
always @ (posedge clk or negedge rst_n) begin always @ (posedge clk or negedge rst_n) begin
if (!rst_n) begin if (!rst_n) begin
pc <= RESET_VECTOR; pc <= RESET_VECTOR;
@ -182,7 +206,7 @@ always @ (posedge clk or negedge rst_n) begin
pc <= debug_dpc_wdata; pc <= debug_dpc_wdata;
end else if (debug_mode) begin end else if (debug_mode) begin
pc <= pc; pc <= pc;
end else if ((f_jump_now && !assert_cir_lock) || (cir_lock_prev && deassert_cir_lock)) begin end else if ((f_jump_now && !hold_pc_on_cir_lock) || update_pc_on_cir_unlock) begin
pc <= f_jump_target; pc <= f_jump_target;
end else if (!d_stall && !cir_lock) begin end else if (!d_stall && !cir_lock) begin
// If this instruction is a predicted-taken branch (and has not // If this instruction is a predicted-taken branch (and has not

View File

@ -559,8 +559,8 @@ wire [4:0] zcmp_pushpop_rs2 =
wire [4:0] zcmp_pushpop_rs1 = wire [4:0] zcmp_pushpop_rs1 =
uop_ctr < 4'hd ? 5'd02 : // sp (addr base reg) uop_ctr < 4'hd ? 5'd02 : // sp (addr base reg)
uop_ctr == 4'hd ? 5'd00 : // zero (clear a0) uop_ctr == 4'hd ? 5'd00 : // zero (clear a0)
uop_ctr == 4'he ? 5'd02 : // sp (stack adj) uop_ctr == 4'he ? 5'd01 : // ra (ret)
5'd01 ; // ra (ret) 5'd02 ; // sp (stack adj)
wire [4:0] zcmp_sa01_r1s = {|next_instr[9:8], ~|next_instr[9:8], next_instr[9:7]}; wire [4:0] zcmp_sa01_r1s = {|next_instr[9:8], ~|next_instr[9:8], next_instr[9:7]};
wire [4:0] zcmp_sa01_r2s = {|next_instr[4:3], ~|next_instr[4:3], next_instr[4:2]}; wire [4:0] zcmp_sa01_r2s = {|next_instr[4:3], ~|next_instr[4:3], next_instr[4:2]};

View File

@ -17,10 +17,11 @@ module hazard3_instr_decompress #(
output reg instr_is_32bit, output reg instr_is_32bit,
output reg [31:0] instr_out, output reg [31:0] instr_out,
// Indicate instr_out is a uop, and more uops follow in this sequence. // If instruction is a non-final uop, need to suppress PC update, and null
// Should suppress PC update, and null the PC offset in the mepc address // the PC offset in the mepc address in stage 3.
// in stage 3. output wire instr_out_is_uop,
output wire instr_out_uop_nonfinal, output wire instr_out_is_final_uop,
output wire instr_out_uop_no_pc_update,
// Indicate instr_out is a uop from the noninterruptible part of a uop // Indicate instr_out is a uop from the noninterruptible part of a uop
// sequence. If one uop is noninterruptible, all following uops until the // sequence. If one uop is noninterruptible, all following uops until the
// end of the sequence are also noninterruptible. // end of the sequence are also noninterruptible.
@ -107,17 +108,20 @@ function [31:0] rfmt_rs2; input [4:0] rs2; begin rfmt_rs2 = {7'h00, rs2, 20'h000
// //
// - 13x lw (counter = 0..12) // - 13x lw (counter = 0..12)
// - 1x addi to set a0 to zero (counter = 13 ) < atomic section // - 1x addi to set a0 to zero (counter = 13 ) < atomic section
// - 1x addi to adjust sp (counter = 14 ) < atomic section // - 1x jalr to jump through ra (counter = 14 ) < atomic section
// - 1x jalr to jump through ra (counter = 15 ) < atomic section // - 1x addi to adjust sp (counter = 15 ) < atomic section
reg [3:0] uop_ctr; reg [3:0] uop_ctr;
reg [3:0] uop_ctr_nxt; reg [3:0] uop_ctr_nxt;
reg in_uop_seq; reg in_uop_seq;
reg uop_seq_end; reg uop_seq_end;
reg uop_atomic; reg uop_atomic;
reg uop_no_pc_update;
assign instr_out_uop_nonfinal = in_uop_seq && !uop_seq_end; assign instr_out_is_uop = in_uop_seq;
assign instr_out_is_final_uop = uop_seq_end;
assign instr_out_uop_atomic = uop_atomic; assign instr_out_uop_atomic = uop_atomic;
assign instr_out_uop_no_pc_update = uop_no_pc_update;
assign df_uop_step_next = uop_ctr_nxt; assign df_uop_step_next = uop_ctr_nxt;
// The offset from current sp value to the lowest-addressed saved register, +64. // The offset from current sp value to the lowest-addressed saved register, +64.
@ -188,6 +192,7 @@ end else begin: instr_decompress
uop_seq_end = 1'b0; uop_seq_end = 1'b0;
in_uop_seq = 1'b0; in_uop_seq = 1'b0;
uop_atomic = 1'b0; uop_atomic = 1'b0;
uop_no_pc_update = 1'b0;
uop_ctr_nxt = uop_ctr; uop_ctr_nxt = uop_ctr;
casez (instr_in[15:0]) casez (instr_in[15:0])
16'h0: invalid = 1'b1; 16'h0: invalid = 1'b1;
@ -295,7 +300,7 @@ end else begin: instr_decompress
// Optional Zcmp instructions: // Optional Zcmp instructions:
`RVOPC_CM_PUSH: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin `RVOPC_CM_PUSH: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
invalid = 1'b1; invalid = 1'b1;
end else if (uop_ctr == 4'he) begin end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_seq_end = 1'b1; uop_seq_end = 1'b1;
uop_ctr_nxt = 4'h0; uop_ctr_nxt = 4'h0;
@ -304,14 +309,15 @@ end else begin: instr_decompress
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_push_sw_instr; instr_out = zcmp_push_sw_instr;
uop_no_pc_update = 1'b1;
if (uop_ctr_nxt == zcmp_n_regs) begin if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he; uop_ctr_nxt = 4'hf;
end end
end end
`RVOPC_CM_POP: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin `RVOPC_CM_POP: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
invalid = 1'b1; invalid = 1'b1;
end else if (uop_ctr == 4'he) begin end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_seq_end = 1'b1; uop_seq_end = 1'b1;
uop_ctr_nxt = 4'h0; uop_ctr_nxt = 4'h0;
@ -320,9 +326,10 @@ end else begin: instr_decompress
end else begin end else begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_lw_instr; instr_out = zcmp_pop_lw_instr;
if (uop_ctr_nxt == zcmp_n_regs) begin if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he; uop_ctr_nxt = 4'hf;
end end
end end
@ -334,17 +341,19 @@ end else begin: instr_decompress
// executes, they all execute. Having none execute is fine. // executes, they all execute. Having none execute is fine.
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_stack_adj_instr; instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
end else if (uop_ctr == 4'hf) begin end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_seq_end = 1'b1; uop_seq_end = 1'b1;
uop_atomic = 1'b1; uop_atomic = 1'b1;
uop_ctr_nxt = 4'h0; uop_ctr_nxt = 4'h0;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1); uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_stack_adj_instr;
end else begin end else begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_lw_instr; instr_out = zcmp_pop_lw_instr;
uop_no_pc_update = 1'b1;
if (uop_ctr_nxt == zcmp_n_regs) begin if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'he; uop_ctr_nxt = 4'he;
end end
@ -355,21 +364,24 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'hd) begin end else if (uop_ctr == 4'hd) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10); // li a0, 0 instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10); // li a0, 0
end else if (uop_ctr == 4'he) begin end else if (uop_ctr == 4'he) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_atomic = 1'b1; uop_atomic = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
instr_out = zcmp_pop_stack_adj_instr; instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
end else if (uop_ctr == 4'hf) begin end else if (uop_ctr == 4'hf) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_seq_end = 1'b1; uop_seq_end = 1'b1;
uop_atomic = 1'b1; uop_atomic = 1'b1;
uop_ctr_nxt = 4'h0; uop_ctr_nxt = 4'h0;
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1); uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_stack_adj_instr;
end else begin end else begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = zcmp_pop_lw_instr; instr_out = zcmp_pop_lw_instr;
if (uop_ctr_nxt == zcmp_n_regs) begin if (uop_ctr_nxt == zcmp_n_regs) begin
uop_ctr_nxt = 4'hd; uop_ctr_nxt = 4'hd;
@ -381,6 +393,7 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'h0) begin end else if (uop_ctr == 4'h0) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(zcmp_sa01_r1s) | rfmt_rs1(5'd10); instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(zcmp_sa01_r1s) | rfmt_rs1(5'd10);
end else begin end else begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
@ -395,6 +408,7 @@ end else begin: instr_decompress
end else if (uop_ctr == 4'h0) begin end else if (uop_ctr == 4'h0) begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;
uop_ctr_nxt = uop_ctr + 4'h1; uop_ctr_nxt = uop_ctr + 4'h1;
uop_no_pc_update = 1'b1;
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10) | rfmt_rs1(zcmp_sa01_r1s); instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10) | rfmt_rs1(zcmp_sa01_r1s);
end else begin end else begin
in_uop_seq = 1'b1; in_uop_seq = 1'b1;