Improve performance of lowered chlo.pow with integers

The new lowering uses exponentiation by squaring and always runs a fixed 6 loop iterations, rather than one loop iteration per unit of the exponent. PiperOrigin-RevId: 355131133
2021-02-02 03:27:38 -08:00 · 2021-02-02 03:27:38 -08:00 · ae722a883f
parent 0458ae9a22
commit ae722a883f
2 changed files with 62 additions and 40 deletions
--- a/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h
+++ b/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h
@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"

 namespace mlir {
@ -588,6 +589,7 @@ inline Value MapLhloOpToStdScalarOp<lmhlo::PowOp>(Location loc,
                                                  ArrayRef<Value> args,
                                                  OpBuilder* b) {
  lmhlo::PowOp::Adaptor adaptor(args);
+  auto lb = ImplicitLocOpBuilder(loc, *b);
  // Floating point can use std::powf
  auto result_type = result_types.front();
  if (result_type.isa<::mlir::FloatType>())
@ -597,53 +599,66 @@ inline Value MapLhloOpToStdScalarOp<lmhlo::PowOp>(Location loc,
  assert(result_type.isa<::mlir::IntegerType>() &&
         "only float and integer `pow` is supported right now");

-  // There is no powi, so lower to a simple product.
-  Value neg_one =
-      b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, -1));
-  Value zero = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 0));
-  Value one = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 1));
-  Value two = b->create<ConstantOp>(loc, b->getIntegerAttr(result_type, 2));
+  // Exponentiation by squaring:
+  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
+  Value neg_one = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, -1));
+  Value zero = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 0));
+  Value one = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 1));
+  Value two = lb.create<ConstantOp>(lb.getIntegerAttr(result_type, 2));
+  Value step = lb.create<ConstantIndexOp>(1);
+  Value lowerBound = lb.create<ConstantIndexOp>(0);
+  // Six iterations consume the low 6 bits of the exponent, i.e. exponents
+  // up to 63. For a 64-bit integer, any base with |base| > 1 overflows
+  // once the exponent reaches 64, and 64 is 1 << 6.
+  Value upperBound = lb.create<ConstantIndexOp>(6);
+  auto original_base = adaptor.lhs();
+  auto original_exponent = adaptor.rhs();

-  Value lowerBound = b->create<ConstantIndexOp>(loc, 0);
-  Value upperBound =
-      b->create<IndexCastOp>(loc, adaptor.rhs(), b->getIndexType());
-  Value step = b->create<ConstantIndexOp>(loc, 1);
-  Value for_result =
-      b->create<scf::ForOp>(
-           loc, lowerBound, upperBound, step, llvm::makeArrayRef(one),
-           [&](OpBuilder& b, Location l, Value v, ValueRange iters) {
-             Value prod =
-                 b.create<::mlir::MulIOp>(l, adaptor.lhs(), iters.front());
-             b.create<scf::YieldOp>(l, prod);
-           })
+  Value accum =
+      lb.create<scf::ForOp>(
+            lowerBound, upperBound, step,
+            SmallVector<Value>({one, original_base, original_exponent}),
+            [&](OpBuilder& b, Location, Value v, ValueRange iters) {
+              Value accum = iters[0];
+              Value base = iters[1];
+              Value exponent = iters[2];
+
+              Value condition = b.create<CmpIOp>(
+                  loc, CmpIPredicate::eq,
+                  b.create<::mlir::AndOp>(loc, exponent, one), one);
+              Value multiplied = b.create<::mlir::MulIOp>(loc, accum, base);
+              accum =
+                  b.create<::mlir::SelectOp>(loc, condition, multiplied, accum);
+              base = b.create<::mlir::MulIOp>(loc, base, base);
+              exponent =
+                  b.create<::mlir::UnsignedShiftRightOp>(loc, exponent, one);
+              b.create<scf::YieldOp>(
+                  loc, SmallVector<Value>({accum, base, exponent}));
+            })
          .getResult(0);

-  Value rhs_is_even =
-      b->create<CmpIOp>(loc, CmpIPredicate::eq,
-                        b->create<SignedRemIOp>(loc, adaptor.rhs(), two), zero);
+  Value rhs_is_even = lb.create<CmpIOp>(
+      CmpIPredicate::eq, lb.create<SignedRemIOp>(adaptor.rhs(), two), zero);
  Value rhs_is_negative =
-      b->create<CmpIOp>(loc, CmpIPredicate::slt, adaptor.rhs(), zero);
-  Value lhs_is_one =
-      b->create<CmpIOp>(loc, CmpIPredicate::eq, adaptor.lhs(), one);
+      lb.create<CmpIOp>(CmpIPredicate::slt, adaptor.rhs(), zero);
+  Value lhs_is_one = lb.create<CmpIOp>(CmpIPredicate::eq, adaptor.lhs(), one);
  Value lhs_is_neg_one =
-      b->create<CmpIOp>(loc, CmpIPredicate::eq, adaptor.lhs(), neg_one);
+      lb.create<CmpIOp>(CmpIPredicate::eq, adaptor.lhs(), neg_one);

-  // The for_result is correct when the rhs is non-negative. When rhs is
+  // The accum is correct when the rhs is non-negative. When rhs is
  // negative, we return 0 for integer, with the exception of lhs values of 1
  // and -1 which have integer results for negative exponents. Specifically, the
  // calulation is the following:
  //
-  // - Return for_result if the rhs is not negative.
+  // - Return accum if the rhs is not negative.
  // - Return 1 or -1 depending on the parity of rhs when the lhs is -1.
  // - Return 1 if lhs is 1.
  // - Else return 0.
-  Value if_lhs_is_one = b->create<::mlir::SelectOp>(loc, lhs_is_one, one, zero);
-  Value if_lhs_is_neg_one = b->create<::mlir::SelectOp>(
-      loc, lhs_is_neg_one,
-      b->create<::mlir::SelectOp>(loc, rhs_is_even, one, neg_one),
+  Value if_lhs_is_one = lb.create<::mlir::SelectOp>(lhs_is_one, one, zero);
+  Value if_lhs_is_neg_one = lb.create<::mlir::SelectOp>(
+      lhs_is_neg_one, lb.create<::mlir::SelectOp>(rhs_is_even, one, neg_one),
      if_lhs_is_one);
-  return b->create<::mlir::SelectOp>(loc, rhs_is_negative, if_lhs_is_neg_one,
-                                     for_result);
+  return lb.create<::mlir::SelectOp>(rhs_is_negative, if_lhs_is_neg_one, accum);
 }

 template <>
--- a/tests/hlo-legalize-to-linalg.mlir
+++ b/tests/hlo-legalize-to-linalg.mlir
@ -851,12 +851,19 @@ func @integer_pow(%lhs: tensor<2x2xi32>,
  // CHECK: ^{{[a-z0-9_]*}}
  // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: i32
  // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: i32
-  // CHECK: %[[UPPER:.*]] = index_cast %[[ARG1]]
-  // CHECK: %[[FOR_RESULT:.*]] = scf.for {{.*}} to %[[UPPER]]
-  // CHECK-SAME: step %c1{{[a-zA-Z0-9_]*}}
-  // CHECK-SAME: iter_args(%[[ITER:.*]] = %c1{{.*}}) -> (i32) {
-  //   CHECK: %[[ACCUM:[a-zA-Z0-9_]*]] = muli %[[ARG0]], %[[ITER]]
-  //   CHECK: scf.yield %[[ACCUM]]
+  // CHECK: %[[FOR_RESULT:[a-zA-Z0-9_]*]]:3 = scf.for {{.*}} to %c6 step %c1
+  // CHECK-SAME: iter_args(
+  // CHECK-SAME:   %[[ITER0:.*]] = %c1
+  // CHECK-SAME:   %[[ITER1:.*]] = %[[ARG0]]
+  // CHECK-SAME:   %[[ITER2:.*]] = %[[ARG1]]
+  // CHECK-SAME: ) -> (i32, i32, i32) {
+  //   CHECK: %[[AND:[a-zA-Z0-9_]*]] = and %[[ITER2]], %c1
+  //   CHECK: %[[COND:[a-zA-Z0-9_]*]] = cmpi eq, %[[AND]], %c1
+  //   CHECK: %[[MUL:[a-zA-Z0-9_]*]] = muli %[[ITER0]], %[[ITER1]]
+  //   CHECK: %[[ACCUM:[a-zA-Z0-9_]*]] = select %[[COND]], %[[MUL]], %[[ITER0]]
+  //   CHECK: %[[BASE:[a-zA-Z0-9_]*]] = muli %[[ITER1]], %[[ITER1]]
+  //   CHECK: %[[EXP:[a-zA-Z0-9_]*]] = shift_right_unsigned %[[ITER2]], %c1
+  //   CHECK: scf.yield %[[ACCUM]], %[[BASE]], %[[EXP]]
  // CHECK: %[[RHS_PARITY:.*]] = remi_signed %[[ARG1]], %c2
  // CHECK: %[[RHS_EVEN:.*]] = cmpi eq, %[[RHS_PARITY]], %c0
  // CHECK: %[[RHS_NEG:.*]] = cmpi slt, %[[ARG1]], %c0
@ -865,7 +872,7 @@ func @integer_pow(%lhs: tensor<2x2xi32>,
  // CHECK: %[[VAL5:.*]] = select %[[LHS_ONE]], %c1_i32, %c0
  // CHECK: %[[VAL6:.*]] = select %[[RHS_EVEN]], %c1{{.*}}, %c-1
  // CHECK: %[[VAL7:.*]] = select %[[LHS_NEG_ONE]], %[[VAL6]], %[[VAL5]]
-  // CHECK: %[[RESULT:.*]] = select %[[RHS_NEG]], %[[VAL7]], %[[FOR_RESULT]]
+  // CHECK: %[[RESULT:.*]] = select %[[RHS_NEG]], %[[VAL7]], %[[FOR_RESULT]]#0
  // CHECK: linalg.yield %[[RESULT]]
  %0 = "mhlo.power"(%lhs, %rhs) : (tensor<2x2xi32>,
                                   tensor<2x2xi32>) -> tensor<2x2xi32>