diff --git a/README.md b/README.md
index 58ba80c..79cb879 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ PicoRV32 - A Size-Optimized RISC-V CPU
 ======================================
 
 PicoRV32 is a CPU core that implements the [RISC-V RV32IMC Instruction Set](http://riscv.org/).
-It can be configured to be a RV32E, RV32I, RV32IC, RV32IM, or RV32IMC core, and optionally
+It can be configured as an RV32E, RV32I, RV32IC, RV32IM, or RV32IMC core, and optionally
 contains a built-in interrupt controller.
 
 Tools (gcc, binutils, etc..) can be obtained via the [RISC-V Website](http://riscv.org/download.html#tab_tools).
@@ -29,7 +29,7 @@ PicoRV32 is free and open hardware licensed under the [ISC license](http://en.wi
 Features and Typical Applications
 ---------------------------------
 
-- Small (750-1700 LUTs in 7-Series Xilinx Architecture)
+- Small (750-2000 LUTs in 7-Series Xilinx Architecture)
 - High f<sub>max</sub> (250-450 MHz on 7-Series Xilinx FPGAs)
 - Selectable native memory interface or AXI4-Lite master
 - Optional IRQ support (using a simple custom ISA)
@@ -170,6 +170,12 @@ of 4 bits and then shift in units of 1 bit. This speeds up shift operations,
 but adds additional hardware. Set this parameter to 0 to disable the two-stage
 shift to further reduce the size of the core.
 
+#### BARREL_SHIFTER (default = 0)
+
+By default, shift operations are performed by successively shifting by a
+small amount (see `TWO_STAGE_SHIFT` above). With this option set, a barrel
+shifter is used instead.
+
 #### TWO_CYCLE_COMPARE (default = 0)
 
 This relaxes the longest data path a bit by adding an additional FF stage
@@ -294,9 +300,15 @@ in 40 cycles and a `MULH[SU|U]` instruction will execute in 72 cycles.
 When `ENABLE_DIV` is activated, then a `DIV[U]/REM[U]` instruction will
 execute in 40 cycles.
 
-Dhrystone benchmark results: 0.391 DMIPS/MHz (688 Dhrystones/Second/MHz)
+When `BARREL_SHIFTER` is activated, a shift operation takes as long as
+any other ALU operation.
 
-For the Dhrystone benchmark the average CPI is 4.110.
+The following Dhrystone benchmark results are for a core with the
+`ENABLE_MUL`, `ENABLE_DIV`, and `BARREL_SHIFTER` options enabled.
+
+Dhrystone benchmark results: 0.399 DMIPS/MHz (702 Dhrystones/Second/MHz)
+
+For the Dhrystone benchmark the average CPI is 4.030.
 
 
 PicoRV32 Native Memory Interface
@@ -586,7 +598,7 @@ once in advance.
 Evaluation: Timing and Utilization on Xilinx 7-Series FPGAs
 -----------------------------------------------------------
 
-The following evaluations have been performed with Vivado 2015.1.
+The following evaluations have been performed with Vivado 2015.4.
 
 #### Timing on Xilinx 7-Series FPGAs
 
@@ -622,7 +634,7 @@ for the following three cores:
 - **PicoRV32 (regular):** The `picorv32` module in its default configuration.
 
 - **PicoRV32 (large):** The `picorv32` module with enabled PCPI, IRQ, MUL,
-  DIV, and COMPRESSED_ISA features.
+  DIV, BARREL_SHIFTER, and COMPRESSED_ISA features.
 
 See `make area` in [scripts/vivado/](scripts/vivado/).
 
diff --git a/dhrystone/testbench.v b/dhrystone/testbench.v
index 6251472..6a03ca0 100644
--- a/dhrystone/testbench.v
+++ b/dhrystone/testbench.v
@@ -27,6 +27,7 @@ module testbench;
 	wire [3:0] mem_la_wstrb;
 
 	picorv32 #(
+		.BARREL_SHIFTER(1),
 		.ENABLE_MUL(1),
 		.ENABLE_DIV(1)
 	) uut (
diff --git a/picorv32.v b/picorv32.v
index 4f70a74..9a3410f 100644
--- a/picorv32.v
+++ b/picorv32.v
@@ -43,6 +43,7 @@ module picorv32 #(
 	parameter [ 0:0] ENABLE_REGS_DUALPORT = 1,
 	parameter [ 0:0] LATCHED_MEM_RDATA = 0,
 	parameter [ 0:0] TWO_STAGE_SHIFT = 1,
+	parameter [ 0:0] BARREL_SHIFTER = 0,
 	parameter [ 0:0] TWO_CYCLE_COMPARE = 0,
 	parameter [ 0:0] TWO_CYCLE_ALU = 0,
 	parameter [ 0:0] COMPRESSED_ISA = 0,
@@ -889,6 +890,7 @@ module picorv32 #(
 	reg alu_wait, alu_wait_2;
 
 	reg [31:0] alu_add_sub;
+	reg [31:0] alu_shl, alu_shr;
 	reg alu_eq, alu_ltu, alu_lts;
 
 	generate if (TWO_CYCLE_ALU) begin
@@ -897,6 +899,8 @@ module picorv32 #(
 			alu_eq <= reg_op1 == reg_op2;
 			alu_lts <= $signed(reg_op1) < $signed(reg_op2);
 			alu_ltu <= reg_op1 < reg_op2;
+			alu_shl <= reg_op1 << reg_op2[4:0];
+			alu_shr <= $signed({instr_sra || instr_srai ? reg_op1[31] : 1'b0, reg_op1}) >>> reg_op2[4:0];
 		end
 	end else begin
 		always @* begin
@@ -904,6 +908,8 @@ module picorv32 #(
 			alu_eq = reg_op1 == reg_op2;
 			alu_lts = $signed(reg_op1) < $signed(reg_op2);
 			alu_ltu = reg_op1 < reg_op2;
+			alu_shl = reg_op1 << reg_op2[4:0];
+			alu_shr = $signed({instr_sra || instr_srai ? reg_op1[31] : 1'b0, reg_op1}) >>> reg_op2[4:0];
 		end
 	end endgenerate
 
@@ -938,6 +944,10 @@ module picorv32 #(
 				alu_out = reg_op1 | reg_op2;
 			instr_andi || instr_and:
 				alu_out = reg_op1 & reg_op2;
+			BARREL_SHIFTER && (instr_sll || instr_slli):
+				alu_out = alu_shl;
+			BARREL_SHIFTER && (instr_srl || instr_srli || instr_sra || instr_srai):
+				alu_out = alu_shr;
 		endcase
 	end
 
@@ -1208,16 +1218,16 @@ module picorv32 #(
 						cpu_state <= cpu_state_ldmem;
 						mem_do_rinst <= 1;
 					end
-					is_slli_srli_srai: begin
+					is_slli_srli_srai && !BARREL_SHIFTER: begin
 						`debug($display("LD_RS1: %2d 0x%08x", decoded_rs1, decoded_rs1 ? cpuregs[decoded_rs1] : 0);)
 						reg_op1 <= decoded_rs1 ? cpuregs[decoded_rs1] : 0;
 						reg_sh <= decoded_rs2;
 						cpu_state <= cpu_state_shift;
 					end
-					is_jalr_addi_slti_sltiu_xori_ori_andi: begin
+					is_jalr_addi_slti_sltiu_xori_ori_andi, is_slli_srli_srai && BARREL_SHIFTER: begin
 						`debug($display("LD_RS1: %2d 0x%08x", decoded_rs1, decoded_rs1 ? cpuregs[decoded_rs1] : 0);)
 						reg_op1 <= decoded_rs1 ? cpuregs[decoded_rs1] : 0;
-						reg_op2 <= decoded_imm;
+						reg_op2 <= is_slli_srli_srai && BARREL_SHIFTER ? decoded_rs2 : decoded_imm;
 						if (TWO_CYCLE_ALU)
 							alu_wait <= 1;
 						else
@@ -1237,7 +1247,7 @@ module picorv32 #(
 									cpu_state <= cpu_state_stmem;
 									mem_do_rinst <= 1;
 								end
-								is_sll_srl_sra: begin
+								is_sll_srl_sra && !BARREL_SHIFTER: begin
 									cpu_state <= cpu_state_shift;
 								end
 								default: begin
@@ -1284,7 +1294,7 @@ module picorv32 #(
 						cpu_state <= cpu_state_stmem;
 						mem_do_rinst <= 1;
 					end
-					is_sll_srl_sra: begin
+					is_sll_srl_sra && !BARREL_SHIFTER: begin
 						cpu_state <= cpu_state_shift;
 					end
 					default: begin
@@ -1299,8 +1309,6 @@ module picorv32 #(
 			end
 
 			cpu_state_exec: begin
-				latched_store <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0;
-				latched_branch <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0;
 				reg_out <= reg_pc + decoded_imm;
 				if ((TWO_CYCLE_ALU || TWO_CYCLE_COMPARE) && (alu_wait || alu_wait_2)) begin
 					mem_do_rinst <= mem_do_prefetch && !alu_wait_2;
@@ -1308,6 +1316,8 @@ module picorv32 #(
 				end else
 				if (is_beq_bne_blt_bge_bltu_bgeu) begin
 					latched_rd <= 0;
+					latched_store <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0;
+					latched_branch <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0;
 					if (mem_done)
 						cpu_state <= cpu_state_fetch;
 					if (TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0) begin
@@ -1704,6 +1714,7 @@ module picorv32_axi #(
 	parameter [ 0:0] ENABLE_REGS_16_31 = 1,
 	parameter [ 0:0] ENABLE_REGS_DUALPORT = 1,
 	parameter [ 0:0] TWO_STAGE_SHIFT = 1,
+	parameter [ 0:0] BARREL_SHIFTER = 0,
 	parameter [ 0:0] TWO_CYCLE_COMPARE = 0,
 	parameter [ 0:0] TWO_CYCLE_ALU = 0,
 	parameter [ 0:0] COMPRESSED_ISA = 0,
@@ -1803,6 +1814,7 @@ module picorv32_axi #(
 		.ENABLE_REGS_16_31   (ENABLE_REGS_16_31   ),
 		.ENABLE_REGS_DUALPORT(ENABLE_REGS_DUALPORT),
 		.TWO_STAGE_SHIFT     (TWO_STAGE_SHIFT     ),
+		.BARREL_SHIFTER      (BARREL_SHIFTER      ),
 		.TWO_CYCLE_COMPARE   (TWO_CYCLE_COMPARE   ),
 		.TWO_CYCLE_ALU       (TWO_CYCLE_ALU       ),
 		.COMPRESSED_ISA      (COMPRESSED_ISA      ),
diff --git a/scripts/vivado/synth_area_top.v b/scripts/vivado/synth_area_top.v
index c2eddeb..6298a86 100644
--- a/scripts/vivado/synth_area_top.v
+++ b/scripts/vivado/synth_area_top.v
@@ -105,6 +105,7 @@ module top_large (
 );
 	picorv32 #(
 		.COMPRESSED_ISA(1),
+		.BARREL_SHIFTER(1),
 		.ENABLE_PCPI(1),
 		.ENABLE_MUL(1),
 		.ENABLE_IRQ(1)