diff --git a/README.md b/README.md index 58ba80c..79cb879 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ PicoRV32 - A Size-Optimized RISC-V CPU ====================================== PicoRV32 is a CPU core that implements the [RISC-V RV32IMC Instruction Set](http://riscv.org/). -It can be configured to be a RV32E, RV32I, RV32IC, RV32IM, or RV32IMC core, and optionally +It can be configured as RV32E, RV32I, RV32IC, RV32IM, or RV32IMC core, and optionally contains a built-in interrupt controller. Tools (gcc, binutils, etc..) can be obtained via the [RISC-V Website](http://riscv.org/download.html#tab_tools). @@ -29,7 +29,7 @@ PicoRV32 is free and open hardware licensed under the [ISC license](http://en.wi Features and Typical Applications --------------------------------- -- Small (750-1700 LUTs in 7-Series Xilinx Architecture) +- Small (750-2000 LUTs in 7-Series Xilinx Architecture) - High fmax (250-450 MHz on 7-Series Xilinx FPGAs) - Selectable native memory interface or AXI4-Lite master - Optional IRQ support (using a simple custom ISA) @@ -170,6 +170,12 @@ of 4 bits and then shift in units of 1 bit. This speeds up shift operations, but adds additional hardware. Set this parameter to 0 to disable the two-stage shift to further reduce the size of the core. +#### BARREL_SHIFTER (default = 0) + +By default shift operations are performed by successively shifting by a +small amount (see `TWO_STAGE_SHIFT` above). With this option set, a barrel +shifter is used instead. + #### TWO_CYCLE_COMPARE (default = 0) This relaxes the longest data path a bit by adding an additional FF stage @@ -294,9 +300,15 @@ in 40 cycles and a `MULH[SU|U]` instruction will execute in 72 cycles. When `ENABLE_DIV` is activated, then a `DIV[U]/REM[U]` instruction will execute in 40 cycles. -Dhrystone benchmark results: 0.391 DMIPS/MHz (688 Dhrystones/Second/MHz) +When `BARREL_SHIFTER` is activated, a shift operation takes as long as +any other ALU operation. 
-For the Dhrystone benchmark the average CPI is 4.110. +The following Dhrystone benchmark results are for a core with enabled +`ENABLE_MUL`, `ENABLE_DIV`, and `BARREL_SHIFTER` options. + +Dhrystone benchmark results: 0.399 DMIPS/MHz (702 Dhrystones/Second/MHz) + +For the Dhrystone benchmark the average CPI is 4.030. PicoRV32 Native Memory Interface @@ -586,7 +598,7 @@ once in advance. Evaluation: Timing and Utilization on Xilinx 7-Series FPGAs ----------------------------------------------------------- -The following evaluations have been performed with Vivado 2015.1. +The following evaluations have been performed with Vivado 2015.4. #### Timing on Xilinx 7-Series FPGAs @@ -622,7 +634,7 @@ for the following three cores: - **PicoRV32 (regular):** The `picorv32` module in its default configuration. - **PicoRV32 (large):** The `picorv32` module with enabled PCPI, IRQ, MUL, - DIV, and COMPRESSED_ISA features. + DIV, BARREL_SHIFTER, and COMPRESSED_ISA features. See `make area` in [scripts/vivado/](scripts/vivado/). 
diff --git a/dhrystone/testbench.v b/dhrystone/testbench.v index 6251472..6a03ca0 100644 --- a/dhrystone/testbench.v +++ b/dhrystone/testbench.v @@ -27,6 +27,7 @@ module testbench; wire [3:0] mem_la_wstrb; picorv32 #( + .BARREL_SHIFTER(1), .ENABLE_MUL(1), .ENABLE_DIV(1) ) uut ( diff --git a/picorv32.v b/picorv32.v index 4f70a74..9a3410f 100644 --- a/picorv32.v +++ b/picorv32.v @@ -43,6 +43,7 @@ module picorv32 #( parameter [ 0:0] ENABLE_REGS_DUALPORT = 1, parameter [ 0:0] LATCHED_MEM_RDATA = 0, parameter [ 0:0] TWO_STAGE_SHIFT = 1, + parameter [ 0:0] BARREL_SHIFTER = 0, parameter [ 0:0] TWO_CYCLE_COMPARE = 0, parameter [ 0:0] TWO_CYCLE_ALU = 0, parameter [ 0:0] COMPRESSED_ISA = 0, @@ -889,6 +890,7 @@ module picorv32 #( reg alu_wait, alu_wait_2; reg [31:0] alu_add_sub; + reg [31:0] alu_shl, alu_shr; reg alu_eq, alu_ltu, alu_lts; generate if (TWO_CYCLE_ALU) begin @@ -897,6 +899,8 @@ module picorv32 #( alu_eq <= reg_op1 == reg_op2; alu_lts <= $signed(reg_op1) < $signed(reg_op2); alu_ltu <= reg_op1 < reg_op2; + alu_shl <= reg_op1 << reg_op2[4:0]; + alu_shr <= $signed({instr_sra || instr_srai ? reg_op1[31] : 1'b0, reg_op1}) >>> reg_op2[4:0]; end end else begin always @* begin @@ -904,6 +908,8 @@ module picorv32 #( alu_eq = reg_op1 == reg_op2; alu_lts = $signed(reg_op1) < $signed(reg_op2); alu_ltu = reg_op1 < reg_op2; + alu_shl = reg_op1 << reg_op2[4:0]; + alu_shr = $signed({instr_sra || instr_srai ? 
reg_op1[31] : 1'b0, reg_op1}) >>> reg_op2[4:0]; end end endgenerate @@ -938,6 +944,10 @@ module picorv32 #( alu_out = reg_op1 | reg_op2; instr_andi || instr_and: alu_out = reg_op1 & reg_op2; + BARREL_SHIFTER && (instr_sll || instr_slli): + alu_out = alu_shl; + BARREL_SHIFTER && (instr_srl || instr_srli || instr_sra || instr_srai): + alu_out = alu_shr; endcase end @@ -1208,16 +1218,16 @@ module picorv32 #( cpu_state <= cpu_state_ldmem; mem_do_rinst <= 1; end - is_slli_srli_srai: begin + is_slli_srli_srai && !BARREL_SHIFTER: begin `debug($display("LD_RS1: %2d 0x%08x", decoded_rs1, decoded_rs1 ? cpuregs[decoded_rs1] : 0);) reg_op1 <= decoded_rs1 ? cpuregs[decoded_rs1] : 0; reg_sh <= decoded_rs2; cpu_state <= cpu_state_shift; end - is_jalr_addi_slti_sltiu_xori_ori_andi: begin + is_jalr_addi_slti_sltiu_xori_ori_andi, is_slli_srli_srai && BARREL_SHIFTER: begin `debug($display("LD_RS1: %2d 0x%08x", decoded_rs1, decoded_rs1 ? cpuregs[decoded_rs1] : 0);) reg_op1 <= decoded_rs1 ? cpuregs[decoded_rs1] : 0; - reg_op2 <= decoded_imm; + reg_op2 <= is_slli_srli_srai && BARREL_SHIFTER ? decoded_rs2 : decoded_imm; if (TWO_CYCLE_ALU) alu_wait <= 1; else @@ -1237,7 +1247,7 @@ module picorv32 #( cpu_state <= cpu_state_stmem; mem_do_rinst <= 1; end - is_sll_srl_sra: begin + is_sll_srl_sra && !BARREL_SHIFTER: begin cpu_state <= cpu_state_shift; end default: begin @@ -1284,7 +1294,7 @@ module picorv32 #( cpu_state <= cpu_state_stmem; mem_do_rinst <= 1; end - is_sll_srl_sra: begin + is_sll_srl_sra && !BARREL_SHIFTER: begin cpu_state <= cpu_state_shift; end default: begin @@ -1299,8 +1309,6 @@ module picorv32 #( end cpu_state_exec: begin - latched_store <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0; - latched_branch <= TWO_CYCLE_COMPARE ? 
alu_out_0_q : alu_out_0; reg_out <= reg_pc + decoded_imm; if ((TWO_CYCLE_ALU || TWO_CYCLE_COMPARE) && (alu_wait || alu_wait_2)) begin mem_do_rinst <= mem_do_prefetch && !alu_wait_2; @@ -1308,6 +1316,8 @@ module picorv32 #( end else if (is_beq_bne_blt_bge_bltu_bgeu) begin latched_rd <= 0; + latched_store <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0; + latched_branch <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0; if (mem_done) cpu_state <= cpu_state_fetch; if (TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0) begin @@ -1704,6 +1714,7 @@ module picorv32_axi #( parameter [ 0:0] ENABLE_REGS_16_31 = 1, parameter [ 0:0] ENABLE_REGS_DUALPORT = 1, parameter [ 0:0] TWO_STAGE_SHIFT = 1, + parameter [ 0:0] BARREL_SHIFTER = 0, parameter [ 0:0] TWO_CYCLE_COMPARE = 0, parameter [ 0:0] TWO_CYCLE_ALU = 0, parameter [ 0:0] COMPRESSED_ISA = 0, @@ -1803,6 +1814,7 @@ module picorv32_axi #( .ENABLE_REGS_16_31 (ENABLE_REGS_16_31 ), .ENABLE_REGS_DUALPORT(ENABLE_REGS_DUALPORT), .TWO_STAGE_SHIFT (TWO_STAGE_SHIFT ), + .BARREL_SHIFTER (BARREL_SHIFTER ), .TWO_CYCLE_COMPARE (TWO_CYCLE_COMPARE ), .TWO_CYCLE_ALU (TWO_CYCLE_ALU ), .COMPRESSED_ISA (COMPRESSED_ISA ), diff --git a/scripts/vivado/synth_area_top.v b/scripts/vivado/synth_area_top.v index c2eddeb..6298a86 100644 --- a/scripts/vivado/synth_area_top.v +++ b/scripts/vivado/synth_area_top.v @@ -105,6 +105,7 @@ module top_large ( ); picorv32 #( .COMPRESSED_ISA(1), + .BARREL_SHIFTER(1), .ENABLE_PCPI(1), .ENABLE_MUL(1), .ENABLE_IRQ(1)