Save a cycle on popret/popretz by executing the stack adjust after the jump
This commit is contained in:
parent
b074d370a6
commit
4a1d2b5008
5919
doc/hazard3.pdf
5919
doc/hazard3.pdf
File diff suppressed because it is too large
Load Diff
|
@ -182,13 +182,12 @@ They perform identically to their 32-bit counterparts.
|
|||
| Instruction | Cycles | Note
|
||||
|`cm.push {rlist}, -imm` | 1 + _n_ | _n_ is number of registers in rlist
|
||||
|`cm.pop {rlist}, imm` | 1 + _n_ | _n_ is number of registers in rlist
|
||||
|`cm.popret {rlist}, imm` | 3 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist
|
||||
|`cm.popretz {rlist}, imm` | 4 + _n_ footnote:unaligned_branch[] | _n_ is number of registers in rlist
|
||||
|`cm.popret {rlist}, imm` | 4 (_n_ = 1)footnote:popret_stall[The single-register variants of `cm.popret` and `cm.popretz` take the same number of cycles as the two-register variants, because of an internal load-use dependency on the loaded return address.] or 2 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|
||||
|`cm.popretz {rlist}, imm` | 5 (_n_ = 1)footnote:popret_stall[] or 3 + _n_ (_n_ >= 2)footnote:unaligned_branch[] | _n_ is number of registers in rlist
|
||||
|`cm.mva01s r1s', r2s'` | 2 |
|
||||
|`cm.mvsa01 r1s', r2s'` | 2 |
|
||||
|===
|
||||
|
||||
|
||||
=== Branch Predictor
|
||||
|
||||
Hazard3 includes a minimal branch predictor, to accelerate tight loops:
|
||||
|
|
|
@ -61,11 +61,11 @@ parameter EXTENSION_ZBKB = 0,
|
|||
// EXTENSION_ZCB: Support for Zcb basic additional compressed instructions
|
||||
// Requires: EXTENSION_C. (Some Zcb instructions also require Zbb or M.)
|
||||
// Note Zca is equivalent to C, as we do not support the F extension.
|
||||
parameter EXTENSION_ZCB = 0,
|
||||
parameter EXTENSION_ZCB = 1,
|
||||
|
||||
// EXTENSION_ZCMP: Support for Zcmp push/pop instructions.
|
||||
// Requires: EXTENSION_C.
|
||||
parameter EXTENSION_ZCMP = 0,
|
||||
parameter EXTENSION_ZCMP = 1,
|
||||
|
||||
// EXTENSION_ZIFENCEI: Support for the fence.i instruction
|
||||
// Optional, since a plain branch/jump will also flush the prefetch queue.
|
||||
|
|
|
@ -78,7 +78,9 @@ wire d_invalid_16bit;
|
|||
reg d_invalid_32bit;
|
||||
wire d_invalid = d_invalid_16bit || d_invalid_32bit;
|
||||
|
||||
wire uop_nonfinal;
|
||||
wire uop_seq_raw;
|
||||
wire uop_final;
|
||||
wire uop_no_pc_update;
|
||||
wire uop_atomic;
|
||||
wire uop_stall;
|
||||
wire uop_clear;
|
||||
|
@ -86,26 +88,40 @@ wire uop_clear;
|
|||
hazard3_instr_decompress #(
|
||||
`include "hazard3_config_inst.vh"
|
||||
) decomp (
|
||||
.clk (clk),
|
||||
.rst_n (rst_n),
|
||||
.clk (clk),
|
||||
.rst_n (rst_n),
|
||||
|
||||
.instr_in (fd_cir),
|
||||
.instr_is_32bit (d_instr_is_32bit),
|
||||
.instr_out (d_instr),
|
||||
.instr_out_uop_nonfinal (uop_nonfinal),
|
||||
.instr_out_uop_atomic (uop_atomic),
|
||||
.instr_out_uop_stall (uop_stall),
|
||||
.instr_out_uop_clear (uop_clear),
|
||||
.instr_in (fd_cir),
|
||||
.instr_is_32bit (d_instr_is_32bit),
|
||||
.instr_out (d_instr),
|
||||
.instr_out_is_uop (uop_seq_raw),
|
||||
.instr_out_is_final_uop (uop_final),
|
||||
.instr_out_uop_no_pc_update (uop_no_pc_update),
|
||||
.instr_out_uop_atomic (uop_atomic),
|
||||
.instr_out_uop_stall (uop_stall),
|
||||
.instr_out_uop_clear (uop_clear),
|
||||
|
||||
.df_uop_step_next (df_uop_step_next),
|
||||
.df_uop_step_next (df_uop_step_next),
|
||||
|
||||
.invalid (d_invalid_16bit)
|
||||
.invalid (d_invalid_16bit)
|
||||
);
|
||||
|
||||
assign d_uninterruptible = uop_atomic && !d_invalid;
|
||||
assign d_no_pc_increment = uop_nonfinal && !d_invalid;
|
||||
wire uop_seq = uop_seq_raw && !d_starved;
|
||||
wire uop_nonfinal = uop_seq && !uop_final;
|
||||
assign uop_stall = x_stall || d_starved;
|
||||
assign uop_clear = f_jump_now;
|
||||
|
||||
assign d_uninterruptible = uop_atomic && !d_invalid;
|
||||
|
||||
// Signal to null the mepc offset when taking an exception on this
|
||||
// instruction (because uops in a sequence *which can take an exception*, so excluding
|
||||
// the final sp adjust on popret/popretz, will all have the same PC as the
|
||||
// next uop, which will be in stage 2 when they take their exception)
|
||||
assign d_no_pc_increment = uop_nonfinal;
|
||||
|
||||
// Note !df_cir_flush_behind because the jump in cm.popret/popretz is
|
||||
// the *penultimate* instruction: we execute the stack adjustment in the
|
||||
// fetch bubble to save a cycle, so we still need to finish the uop sequence.
|
||||
assign uop_clear = f_jump_now && !df_cir_flush_behind;
|
||||
|
||||
// Decode various immediate formats
|
||||
wire [31:0] d_imm_i = {{21{d_instr[31]}}, d_instr[30:20]};
|
||||
|
@ -157,8 +173,8 @@ end
|
|||
|
||||
reg [W_ADDR-1:0] pc;
|
||||
wire [W_ADDR-1:0] pc_seq_next = pc + (
|
||||
|EXTENSION_ZCMP && uop_nonfinal ? 32'h0 :
|
||||
d_instr_is_32bit ? 32'h4 : 32'h2
|
||||
|EXTENSION_ZCMP && uop_seq && uop_no_pc_update ? 32'h0 :
|
||||
d_instr_is_32bit ? 32'h4 : 32'h2
|
||||
);
|
||||
|
||||
assign d_pc = pc;
|
||||
|
@ -174,6 +190,14 @@ wire partial_predicted_branch = !d_starved &&
|
|||
|
||||
wire predicted_branch = |BRANCH_PREDICTOR && fd_cir_predbranch[0];
|
||||
|
||||
// Generally locking takes place on a stalled jump/branch, which may need the
|
||||
// original PC available to produce a link address when it unstalls. An
|
||||
// exception to this is jumps in micro-op sequences: in this case the jump is
|
||||
// the penultimate instruction in the sequence (ret before addi sp) and we
|
||||
// need to capture the pc mid-uop-sequence.
|
||||
wire hold_pc_on_cir_lock = assert_cir_lock && !(uop_seq && !uop_no_pc_update);
|
||||
wire update_pc_on_cir_unlock = cir_lock_prev && deassert_cir_lock && !(uop_seq && uop_no_pc_update);
|
||||
|
||||
always @ (posedge clk or negedge rst_n) begin
|
||||
if (!rst_n) begin
|
||||
pc <= RESET_VECTOR;
|
||||
|
@ -182,7 +206,7 @@ always @ (posedge clk or negedge rst_n) begin
|
|||
pc <= debug_dpc_wdata;
|
||||
end else if (debug_mode) begin
|
||||
pc <= pc;
|
||||
end else if ((f_jump_now && !assert_cir_lock) || (cir_lock_prev && deassert_cir_lock)) begin
|
||||
end else if ((f_jump_now && !hold_pc_on_cir_lock) || update_pc_on_cir_unlock) begin
|
||||
pc <= f_jump_target;
|
||||
end else if (!d_stall && !cir_lock) begin
|
||||
// If this instruction is a predicted-taken branch (and has not
|
||||
|
|
|
@ -559,8 +559,8 @@ wire [4:0] zcmp_pushpop_rs2 =
|
|||
wire [4:0] zcmp_pushpop_rs1 =
|
||||
uop_ctr < 4'hd ? 5'd02 : // sp (addr base reg)
|
||||
uop_ctr == 4'hd ? 5'd00 : // zero (clear a0)
|
||||
uop_ctr == 4'he ? 5'd02 : // sp (stack adj)
|
||||
5'd01 ; // ra (ret)
|
||||
uop_ctr == 4'he ? 5'd01 : // ra (ret)
|
||||
5'd02 ; // sp (stack adj)
|
||||
|
||||
wire [4:0] zcmp_sa01_r1s = {|next_instr[9:8], ~|next_instr[9:8], next_instr[9:7]};
|
||||
wire [4:0] zcmp_sa01_r2s = {|next_instr[4:3], ~|next_instr[4:3], next_instr[4:2]};
|
||||
|
|
|
@ -17,10 +17,11 @@ module hazard3_instr_decompress #(
|
|||
output reg instr_is_32bit,
|
||||
|
||||
output reg [31:0] instr_out,
|
||||
// Indicate instr_out is a uop, and more uops follow in this sequence.
|
||||
// Should suppress PC update, and null the PC offset in the mepc address
|
||||
// in stage 3.
|
||||
output wire instr_out_uop_nonfinal,
|
||||
// If instruction is a non-final uop, need to suppress PC update, and null
|
||||
// the PC offset in the mepc address in stage 3.
|
||||
output wire instr_out_is_uop,
|
||||
output wire instr_out_is_final_uop,
|
||||
output wire instr_out_uop_no_pc_update,
|
||||
// Indicate instr_out is a uop from the noninterruptible part of a uop
|
||||
// sequence. If one uop is noninterruptible, all following uops until the
|
||||
// end of the sequence are also noninterruptible.
|
||||
|
@ -107,17 +108,20 @@ function [31:0] rfmt_rs2; input [4:0] rs2; begin rfmt_rs2 = {7'h00, rs2, 20'h000
|
|||
//
|
||||
// - 13x lw (counter = 0..12)
|
||||
// - 1x addi to set a0 to zero (counter = 13 ) < atomic section
|
||||
// - 1x addi to adjust sp (counter = 14 ) < atomic section
|
||||
// - 1x jalr to jump through ra (counter = 15 ) < atomic section
|
||||
// - 1x jalr to jump through ra (counter = 14 ) < atomic section
|
||||
// - 1x addi to adjust sp (counter = 15 ) < atomic section
|
||||
|
||||
reg [3:0] uop_ctr;
|
||||
reg [3:0] uop_ctr_nxt;
|
||||
reg in_uop_seq;
|
||||
reg uop_seq_end;
|
||||
reg uop_atomic;
|
||||
reg uop_no_pc_update;
|
||||
|
||||
assign instr_out_uop_nonfinal = in_uop_seq && !uop_seq_end;
|
||||
assign instr_out_is_uop = in_uop_seq;
|
||||
assign instr_out_is_final_uop = uop_seq_end;
|
||||
assign instr_out_uop_atomic = uop_atomic;
|
||||
assign instr_out_uop_no_pc_update = uop_no_pc_update;
|
||||
assign df_uop_step_next = uop_ctr_nxt;
|
||||
|
||||
// The offset from current sp value to the lowest-addressed saved register, +64.
|
||||
|
@ -188,6 +192,7 @@ end else begin: instr_decompress
|
|||
uop_seq_end = 1'b0;
|
||||
in_uop_seq = 1'b0;
|
||||
uop_atomic = 1'b0;
|
||||
uop_no_pc_update = 1'b0;
|
||||
uop_ctr_nxt = uop_ctr;
|
||||
casez (instr_in[15:0])
|
||||
16'h0: invalid = 1'b1;
|
||||
|
@ -295,7 +300,7 @@ end else begin: instr_decompress
|
|||
// Optional Zcmp instructions:
|
||||
`RVOPC_CM_PUSH: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
|
||||
invalid = 1'b1;
|
||||
end else if (uop_ctr == 4'he) begin
|
||||
end else if (uop_ctr == 4'hf) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_seq_end = 1'b1;
|
||||
uop_ctr_nxt = 4'h0;
|
||||
|
@ -304,14 +309,15 @@ end else begin: instr_decompress
|
|||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
instr_out = zcmp_push_sw_instr;
|
||||
uop_no_pc_update = 1'b1;
|
||||
if (uop_ctr_nxt == zcmp_n_regs) begin
|
||||
uop_ctr_nxt = 4'he;
|
||||
uop_ctr_nxt = 4'hf;
|
||||
end
|
||||
end
|
||||
|
||||
`RVOPC_CM_POP: if (~|EXTENSION_ZCMP || zcmp_rlist < 4'h4) begin
|
||||
invalid = 1'b1;
|
||||
end else if (uop_ctr == 4'he) begin
|
||||
end else if (uop_ctr == 4'hf) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_seq_end = 1'b1;
|
||||
uop_ctr_nxt = 4'h0;
|
||||
|
@ -320,9 +326,10 @@ end else begin: instr_decompress
|
|||
end else begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = zcmp_pop_lw_instr;
|
||||
if (uop_ctr_nxt == zcmp_n_regs) begin
|
||||
uop_ctr_nxt = 4'he;
|
||||
uop_ctr_nxt = 4'hf;
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -334,17 +341,19 @@ end else begin: instr_decompress
|
|||
// executes, they all execute. Having none execute is fine.
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
instr_out = zcmp_pop_stack_adj_instr;
|
||||
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
|
||||
end else if (uop_ctr == 4'hf) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_seq_end = 1'b1;
|
||||
uop_atomic = 1'b1;
|
||||
uop_ctr_nxt = 4'h0;
|
||||
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = zcmp_pop_stack_adj_instr;
|
||||
end else begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
instr_out = zcmp_pop_lw_instr;
|
||||
uop_no_pc_update = 1'b1;
|
||||
if (uop_ctr_nxt == zcmp_n_regs) begin
|
||||
uop_ctr_nxt = 4'he;
|
||||
end
|
||||
|
@ -355,21 +364,24 @@ end else begin: instr_decompress
|
|||
end else if (uop_ctr == 4'hd) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10); // li a0, 0
|
||||
end else if (uop_ctr == 4'he) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_atomic = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
instr_out = zcmp_pop_stack_adj_instr;
|
||||
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
|
||||
end else if (uop_ctr == 4'hf) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_seq_end = 1'b1;
|
||||
uop_atomic = 1'b1;
|
||||
uop_ctr_nxt = 4'h0;
|
||||
instr_out = `RVOPC_NOZ_JALR | rfmt_rs1(5'h1);
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = zcmp_pop_stack_adj_instr;
|
||||
end else begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = zcmp_pop_lw_instr;
|
||||
if (uop_ctr_nxt == zcmp_n_regs) begin
|
||||
uop_ctr_nxt = 4'hd;
|
||||
|
@ -381,6 +393,7 @@ end else begin: instr_decompress
|
|||
end else if (uop_ctr == 4'h0) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(zcmp_sa01_r1s) | rfmt_rs1(5'd10);
|
||||
end else begin
|
||||
in_uop_seq = 1'b1;
|
||||
|
@ -395,6 +408,7 @@ end else begin: instr_decompress
|
|||
end else if (uop_ctr == 4'h0) begin
|
||||
in_uop_seq = 1'b1;
|
||||
uop_ctr_nxt = uop_ctr + 4'h1;
|
||||
uop_no_pc_update = 1'b1;
|
||||
instr_out = `RVOPC_NOZ_ADDI | rfmt_rd(5'd10) | rfmt_rs1(zcmp_sa01_r1s);
|
||||
end else begin
|
||||
in_uop_seq = 1'b1;
|
||||
|
|
Loading…
Reference in New Issue