Improve performance of lowered chlo.pow with integers

The new lowering always runs exactly 6 loop iterations, rather than one iteration per unit of the exponent's value.

PiperOrigin-RevId: 355131133
This commit is contained in:
Tres Popp 2021-02-02 03:27:38 -08:00 committed by TensorFlow MLIR Team
parent 0458ae9a22
commit ae722a883f
2 changed files with 62 additions and 40 deletions

View File

@ -27,6 +27,7 @@ limitations under the License.
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/TypeUtilities.h"
namespace mlir {
@ -588,6 +589,7 @@ inline Value MapLhloOpToStdScalarOp<lmhlo::PowOp>(Location loc,
ArrayRef<Value> args,
OpBuilder* b) {
lmhlo::PowOp::Adaptor adaptor(args);
auto lb = ImplicitLocOpBuilder(loc, *b);
// Floating point can use std::powf
auto result_type = result_types.front();
if (result_type.isa<::mlir::FloatType>())
@ -597,53 +599,66 @@ inline Value MapLhloOpToStdScalarOp<lmhlo::PowOp>(Location loc,
assert(result_type.isa<::mlir::IntegerType>() &&
"only float and integer `pow` is supported right now");
// There is no powi, so lower to a simple product.
Value neg_one =
b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, -1));
Value zero = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 0));
Value one = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 1));
Value two = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 2));
// Exponentiation by squaring:
// https://en.wikipedia.org/wiki/Exponentiation_by_squaring
Value neg_one = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, -1));
Value zero = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 0));
Value one = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 1));
Value two = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 2));
Value step = lb.create<ConstantIndexOp>(1);
Value lowerBound = lb.create<ConstantIndexOp>(0);
// Six iterations cover every exponent below 64 (that's 1 << 6). Anything
// larger would overflow for any base of magnitude > 1, as 2^64 is
// already too large for a 64-bit integer.
Value upperBound = lb.create<ConstantIndexOp>(6);
auto original_base = adaptor.lhs();
auto original_exponent = adaptor.rhs();
Value lowerBound = b->create<ConstantIndexOp>(loc, 0);
Value upperBound =
b->create<IndexCastOp>(loc, adaptor.rhs(), b->getIndexType());
Value step = b->create<ConstantIndexOp>(loc, 1);
Value for_result =
b->create<scf::ForOp>(
loc, lowerBound, upperBound, step, llvm::makeArrayRef(one),
[&](OpBuilder& b, Location l, Value v, ValueRange iters) {
Value prod =
b.create<::mlir::MulIOp>(l, adaptor.lhs(), iters.front());
b.create<scf::YieldOp>(l, prod);
})
Value accum =
lb.create<scf::ForOp>(
lowerBound, upperBound, step,
SmallVector<Value>({one, original_base, original_exponent}),
[&](OpBuilder& b, Location, Value v, ValueRange iters) {
Value accum = iters[0];
Value base = iters[1];
Value exponent = iters[2];
Value condition = b.create<CmpIOp>(
loc, CmpIPredicate::eq,
b.create<::mlir::AndOp>(loc, exponent, one), one);
Value multiplied = b.create<::mlir::MulIOp>(loc, accum, base);
accum =
b.create<::mlir::SelectOp>(loc, condition, multiplied, accum);
base = b.create<::mlir::MulIOp>(loc, base, base);
exponent =
b.create<::mlir::UnsignedShiftRightOp>(loc, exponent, one);
b.create<scf::YieldOp>(
loc, SmallVector<Value>({accum, base, exponent}));
})
.getResult(0);
Value rhs_is_even =
b->create<CmpIOp>(loc, CmpIPredicate::eq,
b->create<SignedRemIOp>(loc, adaptor.rhs(), two), zero);
Value rhs_is_even = lb.create<CmpIOp>(
CmpIPredicate::eq, lb.create<SignedRemIOp>(adaptor.rhs(), two), zero);
Value rhs_is_negative =
b->create<CmpIOp>(loc, CmpIPredicate::slt, adaptor.rhs(), zero);
Value lhs_is_one =
b->create<CmpIOp>(loc, CmpIPredicate::eq, adaptor.lhs(), one);
lb.create<CmpIOp>(CmpIPredicate::slt, adaptor.rhs(), zero);
Value lhs_is_one = lb.create<CmpIOp>(CmpIPredicate::eq, adaptor.lhs(), one);
Value lhs_is_neg_one =
b->create<CmpIOp>(loc, CmpIPredicate::eq, adaptor.lhs(), neg_one);
lb.create<CmpIOp>(CmpIPredicate::eq, adaptor.lhs(), neg_one);
// The for_result is correct when the rhs is non-negative. When rhs is
// The accum is correct when the rhs is non-negative. When rhs is
// negative, we return 0 for integer, with the exception of lhs values of 1
// and -1 which have integer results for negative exponents. Specifically, the
// calculation is the following:
//
// - Return for_result if the rhs is not negative.
// - Return accum if the rhs is not negative.
// - Return 1 or -1 depending on the parity of rhs when the lhs is -1.
// - Return 1 if lhs is 1.
// - Else return 0.
Value if_lhs_is_one = b->create<::mlir::SelectOp>(loc, lhs_is_one, one, zero);
Value if_lhs_is_neg_one = b->create<::mlir::SelectOp>(
loc, lhs_is_neg_one,
b->create<::mlir::SelectOp>(loc, rhs_is_even, one, neg_one),
Value if_lhs_is_one = lb.create<::mlir::SelectOp>(lhs_is_one, one, zero);
Value if_lhs_is_neg_one = lb.create<::mlir::SelectOp>(
lhs_is_neg_one, lb.create<::mlir::SelectOp>(rhs_is_even, one, neg_one),
if_lhs_is_one);
return b->create<::mlir::SelectOp>(loc, rhs_is_negative, if_lhs_is_neg_one,
for_result);
return lb.create<::mlir::SelectOp>(rhs_is_negative, if_lhs_is_neg_one, accum);
}
template <>

View File

@ -851,12 +851,19 @@ func @integer_pow(%lhs: tensor<2x2xi32>,
// CHECK: ^{{[a-z0-9_]*}}
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: i32
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: i32
// CHECK: %[[UPPER:.*]] = index_cast %[[ARG1]]
// CHECK: %[[FOR_RESULT:.*]] = scf.for {{.*}} to %[[UPPER]]
// CHECK-SAME: step %c1{{[a-zA-Z0-9_]*}}
// CHECK-SAME: iter_args(%[[ITER:.*]] = %c1{{.*}}) -> (i32) {
// CHECK: %[[ACCUM:[a-zA-Z0-9_]*]] = muli %[[ARG0]], %[[ITER]]
// CHECK: scf.yield %[[ACCUM]]
// CHECK: %[[FOR_RESULT:[a-zA-Z0-9_]*]]:3 = scf.for {{.*}} to %c6 step %c1
// CHECK-SAME: iter_args(
// CHECK-SAME: %[[ITER0:.*]] = %c1
// CHECK-SAME: %[[ITER1:.*]] = %[[ARG0]]
// CHECK-SAME: %[[ITER2:.*]] = %[[ARG1]]
// CHECK-SAME: ) -> (i32, i32, i32) {
// CHECK: %[[AND:[a-zA-Z0-9_]*]] = and %[[ITER2]], %c1
// CHECK: %[[COND:[a-zA-Z0-9_]*]] = cmpi eq, %[[AND]], %c1
// CHECK: %[[MUL:[a-zA-Z0-9_]*]] = muli %[[ITER0]], %[[ITER1]]
// CHECK: %[[ACCUM:[a-zA-Z0-9_]*]] = select %[[COND]], %[[MUL]], %[[ITER0]]
// CHECK: %[[BASE:[a-zA-Z0-9_]*]] = muli %[[ITER1]], %[[ITER1]]
// CHECK: %[[EXP:[a-zA-Z0-9_]*]] = shift_right_unsigned %[[ITER2]], %c1
// CHECK: scf.yield %[[ACCUM]], %[[BASE]], %[[EXP]]
// CHECK: %[[RHS_PARITY:.*]] = remi_signed %[[ARG1]], %c2
// CHECK: %[[RHS_EVEN:.*]] = cmpi eq, %[[RHS_PARITY]], %c0
// CHECK: %[[RHS_NEG:.*]] = cmpi slt, %[[ARG1]], %c0
@ -865,7 +872,7 @@ func @integer_pow(%lhs: tensor<2x2xi32>,
// CHECK: %[[VAL5:.*]] = select %[[LHS_ONE]], %c1_i32, %c0
// CHECK: %[[VAL6:.*]] = select %[[RHS_EVEN]], %c1{{.*}}, %c-1
// CHECK: %[[VAL7:.*]] = select %[[LHS_NEG_ONE]], %[[VAL6]], %[[VAL5]]
// CHECK: %[[RESULT:.*]] = select %[[RHS_NEG]], %[[VAL7]], %[[FOR_RESULT]]
// CHECK: %[[RESULT:.*]] = select %[[RHS_NEG]], %[[VAL7]], %[[FOR_RESULT]]#0
// CHECK: linalg.yield %[[RESULT]]
%0 = "mhlo.power"(%lhs, %rhs) : (tensor<2x2xi32>,
tensor<2x2xi32>) -> tensor<2x2xi32>