Move XLA-independent transforms to the new MLIR-HLO directory

This is as straightforward as possible; more cleanup/rewrite to come.

PiperOrigin-RevId: 319849713
parent 72010faaa7
commit 31dc1b21eb
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_
-#define TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_CHLO_OPS_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_CHLO_OPS_H_
 
 #include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringRef.h"
 #include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Dialect.h"
@@ -42,4 +42,4 @@ class XlaHloClientDialect : public Dialect {
 }  // namespace xla_chlo
 }  // namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_XLA_IR_CHLO_OPS_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_CHLO_OPS_H_
@@ -15,8 +15,8 @@ limitations under the License.
 
 // This file defines the operations used in the XLA dialect.
 
-#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_
-#define TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_
 
 #include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringRef.h"
 #include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
@@ -96,4 +96,4 @@ LogicalResult deriveShapeFromFirstOperand(
 }  // end namespace xla_hlo
 }  // end namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_OPS_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
-#define TENSORFLOW_COMPILER_MLIR_XLA_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
 
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/StandardTypes.h"
@@ -25,4 +25,4 @@ namespace mlir {
 
 }  // namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_XLA_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_INFER_FUSIBILITY_OP_INTERFACE_H_
@@ -15,8 +15,8 @@ limitations under the License.
 
 // This file defines the operations used in the LXLA dialect.
 
-#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_
-#define TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_
 
 #include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringRef.h"
 #include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
@@ -49,4 +49,4 @@ class XlaLhloDialect : public Dialect {
 }  // namespace xla_lhlo
 }  // end namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_XLA_IR_LHLO_OPS_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_
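
The four guard renames above are mechanical, but they encode the commit's one user-visible change: the include root moves. Both paths can be read straight out of the old and new guard names, so downstream includes change roughly as in this hedged before/after sketch (the short new form assumes tensorflow/compiler/mlir/hlo/include/ is on the include path, matching the include lines added later in this commit):

// Before the move:
#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h"
// After the move:
#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
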
@@ -0,0 +1,80 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_
+
+#include <type_traits>
+
+#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
+
+namespace mlir {
+namespace xla_hlo {
+
+template <typename HloOpTy>
+struct HloToLhloOpImpl {
+  using Type = std::false_type;
+};
+template <typename HloOpTy>
+using HloToLhloOp = typename HloToLhloOpImpl<HloOpTy>::Type;
+
+#define MAP_HLO_TO_LHLO(OpName)             \
+  template <>                               \
+  struct HloToLhloOpImpl<xla_hlo::OpName> { \
+    using Type = xla_lhlo::OpName;          \
+  }
+
+MAP_HLO_TO_LHLO(AbsOp);
+MAP_HLO_TO_LHLO(AddOp);
+MAP_HLO_TO_LHLO(AndOp);
+MAP_HLO_TO_LHLO(BroadcastInDimOp);
+MAP_HLO_TO_LHLO(CeilOp);
+MAP_HLO_TO_LHLO(ConstOp);
+MAP_HLO_TO_LHLO(CompareOp);
+MAP_HLO_TO_LHLO(ComplexOp);
+MAP_HLO_TO_LHLO(ConvOp);
+MAP_HLO_TO_LHLO(ConvertOp);
+MAP_HLO_TO_LHLO(CopyOp);
+MAP_HLO_TO_LHLO(CosOp);
+MAP_HLO_TO_LHLO(DivOp);
+MAP_HLO_TO_LHLO(DotOp);
+MAP_HLO_TO_LHLO(ExpOp);
+MAP_HLO_TO_LHLO(GatherOp);
+MAP_HLO_TO_LHLO(ImagOp);
+MAP_HLO_TO_LHLO(IotaOp);
+MAP_HLO_TO_LHLO(LogOp);
+MAP_HLO_TO_LHLO(MaxOp);
+MAP_HLO_TO_LHLO(MinOp);
+MAP_HLO_TO_LHLO(MulOp);
+MAP_HLO_TO_LHLO(NegOp);
+MAP_HLO_TO_LHLO(RealOp);
+MAP_HLO_TO_LHLO(ReduceOp);
+MAP_HLO_TO_LHLO(ReshapeOp);
+MAP_HLO_TO_LHLO(RemOp);
+MAP_HLO_TO_LHLO(RsqrtOp);
+MAP_HLO_TO_LHLO(SelectOp);
+MAP_HLO_TO_LHLO(SignOp);
+MAP_HLO_TO_LHLO(SinOp);
+MAP_HLO_TO_LHLO(SqrtOp);
+MAP_HLO_TO_LHLO(SubOp);
+MAP_HLO_TO_LHLO(TanhOp);
+
+#undef MAP_HLO_TO_LHLO
+
+}  // namespace xla_hlo
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_
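
This new header is a pure compile-time table: HloToLhloOp<Op> resolves to the LHLO twin of a mapped HLO op and to std::false_type otherwise, which is exactly what the enable_if guards in map_xla_to_scalar_op.h below test. A minimal sketch of the behavior, assuming the header is reachable under the path used in this commit (the static_asserts are illustrative, not part of the change):

#include <type_traits>

#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h"

// A mapped op resolves to its buffer-based LHLO counterpart...
static_assert(std::is_same<mlir::xla_hlo::HloToLhloOp<mlir::xla_hlo::AddOp>,
                           mlir::xla_lhlo::AddOp>::value,
              "AddOp is registered via MAP_HLO_TO_LHLO");

// ...while an op without a MAP_HLO_TO_LHLO entry falls through to the
// primary template's std::false_type, signalling "no LHLO equivalent".
static_assert(std::is_same<mlir::xla_hlo::HloToLhloOp<mlir::xla_hlo::WhileOp>,
                           std::false_type>::value,
              "unmapped ops yield std::false_type");
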
@@ -0,0 +1,510 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_XLA_TO_SCALAR_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_XLA_TO_SCALAR_OP_H_
+
+#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringRef.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringSwitch.h"
+#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
+#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
+#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h"
+
+namespace mlir {
+namespace xla_lhlo {
+namespace impl {
+
+// A struct to map LhloBinaryOpTy type to the corresponding floating-point and
+// integer scalar operation types.
+template <typename LhloBinaryOpTy>
+struct LhloToScalarOp;
+
+template <>
+struct LhloToScalarOp<xla_lhlo::AddOp> {
+  using FOp = ::mlir::AddFOp;
+  using IOp = ::mlir::AddIOp;
+};
+template <>
+struct LhloToScalarOp<xla_lhlo::CompareOp> {
+  using FOp = ::mlir::CmpFOp;
+  using IOp = ::mlir::CmpIOp;
+};
+template <>
+struct LhloToScalarOp<xla_lhlo::DivOp> {
+  using FOp = ::mlir::DivFOp;
+  using IOp = ::mlir::SignedDivIOp;
+};
+template <>
+struct LhloToScalarOp<xla_lhlo::MulOp> {
+  using FOp = ::mlir::MulFOp;
+  using IOp = ::mlir::MulIOp;
+};
+template <>
+struct LhloToScalarOp<xla_lhlo::RemOp> {
+  using FOp = ::mlir::RemFOp;
+  using IOp = ::mlir::SignedRemIOp;
+};
+template <>
+struct LhloToScalarOp<xla_lhlo::SubOp> {
+  using FOp = ::mlir::SubFOp;
+  using IOp = ::mlir::SubIOp;
+};
+
+template <typename LhloBinaryOpTy>
+struct ScalarOp {
+  using FOp = typename LhloToScalarOp<LhloBinaryOpTy>::FOp;
+  using IOp = typename LhloToScalarOp<LhloBinaryOpTy>::IOp;
+};
+
+// Alias for the map from LHLO binary op type to STD floating-point op type.
+template <typename LhloOp>
+using ScalarFOp = typename ScalarOp<LhloOp>::FOp;
+// Alias for the map from LHLO binary op type to STD integer op type.
+template <typename LhloOp>
+using ScalarIOp = typename ScalarOp<LhloOp>::IOp;
+
+template <typename... Args>
+struct MapLhloOpToStdScalarOpImpl {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b) {
+    return nullptr;
+  }
+};
+
+template <typename StdScalarOp>
+struct MapLhloOpToStdScalarOpImpl<StdScalarOp> {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b) {
+    return b->template create<StdScalarOp>(loc, result_types, args,
+                                           mlir::None);
+  }
+};
+
+template <typename SupportedType, typename StdScalarOp, typename... Args>
+struct MapLhloOpToStdScalarOpImpl<SupportedType, StdScalarOp, Args...> {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b) {
+    Type element_type = args.front().getType();
+    if (element_type.isa<SupportedType>()) {
+      return b->template create<StdScalarOp>(loc, result_types, args,
+                                             mlir::None);
+    }
+    return MapLhloOpToStdScalarOpImpl<Args...>{}(loc, result_types, args, b);
+  }
+};
+
+// Inserts the computation that corresponds to the body of the loop for lowered
+// LHLO unary/binary op. Returns the value for the result.
+template <typename LhloOpTy>
+inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef<Type> result_types,
+                                    ArrayRef<Value> args, OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<IntegerType, ScalarIOp<LhloOpTy>, FloatType,
+                                    ScalarFOp<LhloOpTy>>{}(loc, result_types,
+                                                           args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::AbsOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  Type element_type = args.front().getType();
+  if (element_type.isa<FloatType>()) {
+    return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::AbsFOp>{}(
+        loc, result_types, args, b);
+  }
+  if (element_type.isa<IntegerType>()) {
+    // xla_lhlo.abs(x, result) -> result = select((x > 0), x, sub(0, x))
+    Value lhs = args[0];
+    auto integer_type = element_type.dyn_cast<IntegerType>();
+
+    auto zero_intval =
+        b->create<::mlir::ConstantIntOp>(loc, 0, integer_type.getWidth());
+    auto lhs_gt_zero = b->create<ScalarIOp<CompareOp>>(loc, CmpIPredicate::sge,
+                                                       lhs, zero_intval);
+    auto neg_val = b->create<ScalarIOp<xla_lhlo::SubOp>>(loc, zero_intval, lhs);
+    return b->create<::mlir::SelectOp>(loc, lhs_gt_zero, lhs, neg_val);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::AndOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<IntegerType, ::mlir::AndOp>{}(
+      loc, result_types, args, b);
+}
+
+template <typename PredicateType>
+inline Optional<PredicateType> getCmpPredicate(
+    StringRef xla_comparison_direction) {
+  return llvm::None;
+}
+
+template <>
+inline Optional<CmpFPredicate> getCmpPredicate<CmpFPredicate>(
+    StringRef xla_comparison_direction) {
+  return llvm::StringSwitch<Optional<CmpFPredicate>>(xla_comparison_direction)
+      .Case("EQ", CmpFPredicate::OEQ)
+      .Case("NE", CmpFPredicate::ONE)
+      .Case("GE", CmpFPredicate::OGE)
+      .Case("GT", CmpFPredicate::OGT)
+      .Case("LE", CmpFPredicate::OLE)
+      .Case("LT", CmpFPredicate::OLT)
+      .Default(llvm::None);
+}
+
+template <>
+inline Optional<CmpIPredicate> getCmpPredicate<CmpIPredicate>(
+    StringRef xla_comparison_direction) {
+  return llvm::StringSwitch<Optional<CmpIPredicate>>(xla_comparison_direction)
+      .Case("EQ", CmpIPredicate::eq)
+      .Case("NE", CmpIPredicate::ne)
+      .Case("GE", CmpIPredicate::sge)
+      .Case("GT", CmpIPredicate::sgt)
+      .Case("LE", CmpIPredicate::sle)
+      .Case("LT", CmpIPredicate::slt)
+      .Default(llvm::None);
+}
+
+template <typename XLACompareOpTy>
+inline Value MapXlaCompareOpToStdScalarOp(Location loc,
+                                          StringRef comparison_direction,
+                                          ArrayRef<Type> result_types,
+                                          ArrayRef<Value> args, OpBuilder* b) {
+  const auto& lhs = args[0];
+  const auto& rhs = args[1];
+  Type element_type = lhs.getType();
+  if (element_type.isSignlessInteger()) {
+    Optional<CmpIPredicate> predicate =
+        getCmpPredicate<CmpIPredicate>(comparison_direction);
+    assert(predicate.hasValue() && "expected valid comparison direction");
+    return b->create<ScalarIOp<XLACompareOpTy>>(loc, predicate.getValue(), lhs,
+                                                rhs);
+  }
+  if (element_type.isa<FloatType>()) {
+    Optional<CmpFPredicate> predicate =
+        getCmpPredicate<CmpFPredicate>(comparison_direction);
+    assert(predicate.hasValue() && "expected valid comparison direction");
+    return b->create<ScalarFOp<XLACompareOpTy>>(loc, predicate.getValue(), lhs,
+                                                rhs);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::CopyOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return args.front();
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::ExpOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::ExpOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::CeilOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::CeilFOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::ComplexOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<CreateComplexOp>{}(loc, result_types, args,
+                                                       b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::RealOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<ReOp>{}(loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::ImagOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<ImOp>{}(loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::ConvertOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  Type sourceType = args.front().getType();
+  Type targetType = result_types.front();
+
+  if (mlir::SIToFPOp::areCastCompatible(sourceType, targetType)) {
+    return b->create<mlir::SIToFPOp>(loc, result_types, args, mlir::None);
+  } else if (sourceType.isa<FloatType>() && targetType.isa<FloatType>()) {
+    FloatType src = sourceType.cast<FloatType>();
+    FloatType res = targetType.cast<FloatType>();
+    if (src.getWidth() > res.getWidth()) {
+      return b->create<mlir::FPTruncOp>(loc, result_types, args, mlir::None);
+    } else if (src.getWidth() < res.getWidth()) {
+      return b->create<mlir::FPExtOp>(loc, result_types, args, mlir::None);
+    }
+    // No conversion is needed for the same width floats
+    return args.front();
+  }
+  if (sourceType.isSignlessInteger() && targetType.isSignlessInteger()) {
+    IntegerType src = sourceType.cast<IntegerType>();
+    IntegerType res = targetType.cast<IntegerType>();
+    if (src.getWidth() > res.getWidth()) {
+      return b->create<mlir::TruncateIOp>(loc, result_types, args, mlir::None);
+    } else if (src.getWidth() < res.getWidth()) {
+      return b->create<mlir::ZeroExtendIOp>(loc, result_types, args,
+                                            mlir::None);
+    }
+    // No conversion is needed for the same width integers
+    return args.front();
+  }
+  if (mlir::FPToSIOp::areCastCompatible(sourceType, targetType)) {
+    return b->create<mlir::FPToSIOp>(loc, result_types, args, mlir::None);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::DotOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  // Dot Op converter from lhlo to affine only accepts float and integer types.
+  const auto& lhs = args[0];
+  const auto& rhs = args[1];
+  const auto& result = args[2];
+  Type element_type = lhs.getType();
+  if (element_type.isa<FloatType>()) {
+    Value float_mul = MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::MulFOp>{}(
+        loc, result_types, {lhs, rhs}, b);
+    return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::AddFOp>{}(
+        loc, result_types, {float_mul, result}, b);
+  }
+  if (element_type.isa<IntegerType>()) {
+    Value int_mul = MapLhloOpToStdScalarOpImpl<IntegerType, ::mlir::MulIOp>{}(
+        loc, result_types, {lhs, rhs}, b);
+    return MapLhloOpToStdScalarOpImpl<IntegerType, ::mlir::AddIOp>{}(
+        loc, result_types, {int_mul, result}, b);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::CosOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::CosOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::SinOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::SinOp>{}(
+      loc, result_types, args, b);
+}
+
+/// Implements the conversion of XLA op to scalar op (to use within region of a
+/// linalg.generic op) for compare-select style operations like min/max.
+template <typename... Args>
+struct XlaCompareSelectOpToStdScalarOp {
+  static Value map(Location loc, StringRef comparison_direction,
+                   ArrayRef<Type> result_types, ArrayRef<Value> args,
+                   OpBuilder* b) {
+    return nullptr;
+  }
+};
+
+/// Specialization which allows converting to a comparison operation in
+/// standard dialect with a given predicate based on the element type of the
+/// operand.
+template <typename SupportedType, typename StdCompareOp, typename Predicate,
+          typename... Args>
+struct XlaCompareSelectOpToStdScalarOp<SupportedType, StdCompareOp, Predicate,
+                                       Args...> {
+  static Value map(Location loc, StringRef comparison_direction,
+                   ArrayRef<Type> result_types, ArrayRef<Value> args,
+                   OpBuilder* b) {
+    Type element_type = args.front().getType();
+    if (element_type.isa<SupportedType>()) {
+      auto predicate = getCmpPredicate<Predicate>(comparison_direction);
+      assert(predicate.hasValue() && "expected valid comparison direction");
+      auto cmp = b->template create<StdCompareOp>(loc, predicate.getValue(),
+                                                  args[0], args[1]);
+      return b->create<::mlir::SelectOp>(loc, cmp, args[0], args[1]);
+    }
+    return XlaCompareSelectOpToStdScalarOp<Args...>::map(
+        loc, comparison_direction, result_types, args, b);
+  }
+};
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::LogOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::LogOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::MaxOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return XlaCompareSelectOpToStdScalarOp<
+      IntegerType, ScalarIOp<xla_lhlo::CompareOp>, CmpIPredicate, FloatType,
+      ScalarFOp<xla_lhlo::CompareOp>, CmpFPredicate>::map(loc, "GT",
+                                                          result_types, args,
+                                                          b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::MinOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return XlaCompareSelectOpToStdScalarOp<
+      IntegerType, ScalarIOp<xla_lhlo::CompareOp>, CmpIPredicate, FloatType,
+      ScalarFOp<xla_lhlo::CompareOp>, CmpFPredicate>::map(loc, "LT",
+                                                          result_types, args,
+                                                          b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::NegOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  Type element_type = args.front().getType();
+  if (element_type.isa<FloatType>()) {
+    return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::NegFOp>{}(
+        loc, result_types, args, b);
+  }
+  if (element_type.isa<IntegerType>()) {
+    // xla_lhlo.neg(x, result) -> result = sub(0, x)
+    Value lhs = args[0];
+    auto integer_type = element_type.dyn_cast<IntegerType>();
+
+    auto zero_intval =
+        b->create<::mlir::ConstantIntOp>(loc, 0, integer_type.getWidth());
+    return b->create<ScalarIOp<xla_lhlo::SubOp>>(loc, zero_intval, lhs);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::RsqrtOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::RsqrtOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::SelectOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<::mlir::SelectOp>{}(loc, result_types,
+                                                        args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::SignOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  Type element_type = args.front().getType();
+  if (element_type.isa<FloatType>()) {
+    FloatType float_type = element_type.cast<FloatType>();
+    APFloat const_value = float_type.isF32() ? APFloat(1.0f) : APFloat(1.0);
+    Value one = b->create<mlir::ConstantFloatOp>(loc, const_value, float_type);
+    return b->create<::mlir::CopySignOp>(loc, result_types, one, args[0]);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::SqrtOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::SqrtOp>{}(
+      loc, result_types, args, b);
+}
+
+template <>
+inline Value MapLhloOpToStdScalarOp<xla_lhlo::TanhOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Value> args,
+    OpBuilder* b) {
+  return MapLhloOpToStdScalarOpImpl<FloatType, ::mlir::TanhOp>{}(
+      loc, result_types, args, b);
+}
+
+}  // namespace impl
+
+struct XlaOpToStdScalarOp {
+  // Implementation for LHLO ops except xla_lhlo::CompareOp.
+  template <typename XlaOpTy, typename LhloOpTy = XlaOpTy,
+            typename = std::enable_if_t<
+                !std::is_same<LhloOpTy, xla_lhlo::CompareOp>::value &&
+                std::is_same<typename xla_hlo::HloToLhloOp<LhloOpTy>,
+                             std::false_type>::value>>
+  static Value map(XlaOpTy op, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b, unsigned i = 0) {
+    return impl::MapLhloOpToStdScalarOp<LhloOpTy>(op.getLoc(), result_types,
+                                                  args, b);
+  }
+
+  // Implementation for HLO ops except xla_hlo::CompareOp.
+  template <typename XlaOpTy, typename LhloOpTy = xla_hlo::HloToLhloOp<XlaOpTy>,
+            typename = std::enable_if_t<
+                !std::is_same<LhloOpTy, xla_lhlo::CompareOp>::value &&
+                !std::is_same<LhloOpTy, std::false_type>::value>>
+  static Value map(XlaOpTy op, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b, int i = 0) {
+    return impl::MapLhloOpToStdScalarOp<LhloOpTy>(op.getLoc(), result_types,
+                                                  args, b);
+  }
+
+  // Implementation for xla_lhlo::CompareOp.
+  template <typename LhloOpTy, typename = std::enable_if_t<std::is_same<
+                                   LhloOpTy, xla_lhlo::CompareOp>::value>>
+  static Value map(xla_lhlo::CompareOp op, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b) {
+    auto comparison_direction = op.comparison_direction();
+    return impl::MapXlaCompareOpToStdScalarOp<xla_lhlo::CompareOp>(
+        op.getLoc(), comparison_direction, result_types, args, b);
+  }
+
+  // Implementation for xla_hlo::CompareOp.
+  template <typename HloOpTy, typename = std::enable_if_t<std::is_same<
+                                  HloOpTy, xla_hlo::CompareOp>::value>>
+  static Value map(xla_hlo::CompareOp op, ArrayRef<Type> result_types,
+                   ArrayRef<Value> args, OpBuilder* b) {
+    auto comparison_direction = op.comparison_direction();
+    return impl::MapXlaCompareOpToStdScalarOp<xla_lhlo::CompareOp>(
+        op.getLoc(), comparison_direction, result_types, args, b);
+  }
+};
+
+}  // namespace xla_lhlo
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_XLA_TO_SCALAR_OP_H_
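
These maps are consumed by the Linalg/loop lowerings, which build a scalar computation inside a region and need the std-dialect op matching the element type at hand. A hedged sketch of the call-site shape (`add_op`, `block`, and `rewriter` come from the surrounding conversion pattern, which is elided here):

// Inside a ConversionPattern that has already created a loop or
// linalg.generic region whose block arguments are the scalar operands:
SmallVector<Value, 2> scalar_args(block->args_begin(), block->args_end());
Type element_type = scalar_args.front().getType();

// For f32 operands this emits std.addf, for i32 operands std.addi, per the
// LhloToScalarOp<xla_lhlo::AddOp> specialization above.
Value scalar_result = xla_lhlo::XlaOpToStdScalarOp::map<xla_lhlo::AddOp>(
    add_op, element_type, scalar_args, &rewriter);
// ...then yield scalar_result as the region's result.
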
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSES_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSES_H_
+
+#include <memory>
+
+#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/ArrayRef.h"
+
+namespace mlir {
+
+class FuncOp;
+class ModuleOp;
+class Operation;
+template <typename T>
+class OperationPass;
+class Pass;
+
+namespace xla_hlo {
+
+/// Lowers HLO control flow ops to the Standard dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeControlFlowPass();
+
+/// Lowers from HLO dialect to Standard dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeToStdPass();
+
+/// Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary
+/// buffers if necessary. If `results_escape_functions` is set to true,
+/// allocated buffers for function results will be returned and escape the
+/// function. Otherwise, the signature is rewritten with extra arguments for
+/// the buffers that are to be used for results.
+std::unique_ptr<OperationPass<ModuleOp>> createLegalizeToLhloPass(
+    bool results_escape_functions = false);
+
+// Lowers from HLO dialect to Linalg dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeHloToLinalgPass();
+
+// Transforms unranked HLO operations to ranked ones where possible.
+std::unique_ptr<OperationPass<FuncOp>> createTransformUnrankedHloPass();
+
+// Sinks constants implicitly captured in control flow regions. This is
+// necessary to export to XLA.
+std::unique_ptr<OperationPass<FuncOp>> createSinkConstantsToControlFlowPass();
+
+// Fuses xla_hlo ops into kLoop/kInput fusion patterns.
+std::unique_ptr<OperationPass<FuncOp>> createXlaHloFusionPass();
+
+}  // namespace xla_hlo
+
+namespace xla_lhlo {
+
+// Lowers from LHLO dialect to Affine dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeToAffinePass();
+
+// Lowers from LHLO dialect to Linalg dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToLinalgPass();
+
+// Lowers from LHLO dialect to GPU dialect.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeToGpuPass();
+
+// Fuses linalg ops obtained after LHLO lowering. To enable fusion,
+// operations are first tiled.
+//
+// When 'use_parallel_loops' is set, the tiling will use scf.parallel
+// operations. Otherwise, scf.for operations are used.
+//
+// 'tile_sizes' provides the tile sizes to use for tiling. If the linalg
+// operation has more dimensions than tile sizes provided, 1 is used as
+// default.
+std::unique_ptr<OperationPass<FuncOp>> createLhloFuseLinalg(
+    bool use_parallel_loops = false, llvm::ArrayRef<unsigned> tile_sizes = {});
+
+// Removes unnecessary LHLO copies which copy from the allocated buffers to
+// the block arguments. The block arguments are used instead of all uses of
+// these buffers. The buffers are freed. This pass only works in regions that
+// contain a single block.
+std::unique_ptr<Pass> createLhloCopyRemovalPass();
+
+// Lowers from LHLO dialect to parallel loops.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToParallelLoopsPass();
+
+}  // namespace xla_lhlo
+
+namespace xla {
+
+/// Lowers the standard TanhOp to an approximation that does not use
+/// intrinsics.
+std::unique_ptr<OperationPass<FuncOp>> createLegalizeTanhToApproximationPass();
+
+}  // namespace xla
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSES_H_
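
The constructors above are plain factory functions, so they compose through an ordinary PassManager. One plausible tensors-to-loops leg is sketched below, assuming the MLIR API of this era; the `addNestedPass` spelling and the exact ordering are illustrative, not mandated by this header:

#include "mlir/Pass/PassManager.h"

void BuildLoweringPipeline(mlir::PassManager& pm) {
  // Structural HLO control flow to std CFG first.
  pm.addNestedPass<mlir::FuncOp>(
      mlir::xla_hlo::createLegalizeControlFlowPass());
  // Tensors to buffers; module-scoped because function signatures change.
  pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass(
      /*results_escape_functions=*/false));
  // Buffer ops to linalg, then down to parallel loops.
  pm.addNestedPass<mlir::FuncOp>(
      mlir::xla_lhlo::createLegalizeLhloToLinalgPass());
  pm.addNestedPass<mlir::FuncOp>(
      mlir::xla_lhlo::createLegalizeLhloToParallelLoopsPass());
}
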
@@ -0,0 +1,106 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REWRITERS_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REWRITERS_H_
+
+#include <memory>
+
+#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
+#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
+#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+class LLVMTypeConverter;
+class LowerToLLVMOptions;
+class OwningRewritePatternList;
+class BufferAssignmentPlacer;
+namespace xla_hlo {
+
+// Collection of rewrite patterns for lowering a general dot product.
+void PopulateGeneralDotOpLoweringPatterns(OwningRewritePatternList *patterns,
+                                          MLIRContext *ctx);
+
+// Collection of rewrite patterns for lowering complex operations to equivalent
+// float operations.
+void PopulateComplexLoweringPatterns(MLIRContext *context,
+                                     OwningRewritePatternList *patterns);
+
+void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns,
+                              MLIRContext *ctx);
+
+// Collection of rewrite patterns for lowering of HLO to LHLO dialect.
+void populateHLOToLHLOConversionPattern(
+    MLIRContext *context, BufferAssignmentPlacer *bufferAssignment,
+    TypeConverter *converter, OwningRewritePatternList *patterns);
+
+// Collection of rewrite patterns for lowering of HLO to Linalg dialect.
+void populateHLOToLinalgConversionPattern(MLIRContext *context,
+                                          OwningRewritePatternList *patterns);
+
+// Sets up legality definitions for materializing broadcasts.
+void SetupMaterializeBroadcastsLegality(MLIRContext *context,
+                                        ConversionTarget *conversionTarget);
+
+// Populates a collection of rewrite patterns for materializing broadcast
+// attributes to equivalent sequences of ops.
+void PopulateMaterializeBroadcastsPatterns(MLIRContext *context,
+                                           OwningRewritePatternList *patterns);
+
+// Sets up legality definitions for element-wise operations on ranked tensors.
+void SetupTransformUnrankedHloLegality(MLIRContext *context,
+                                       ConversionTarget *conversionTarget);
+
+// Populates a collection of rewrite patterns to realize element-wise
+// operations on ranked tensors where possible.
+void PopulateTransformUnrankedHloPatterns(MLIRContext *context,
+                                          OwningRewritePatternList *patterns);
+
+// Populate a collection of conversion patterns for un-fusing
+// batch_norm_inference and batch_norm_training into constituent HLO ops.
+// TODO(laurenzo): Implement un-fusing of batch_norm_training.
+void PopulateUnfuseBatchNormPatterns(MLIRContext *context,
+                                     OwningRewritePatternList *patterns);
+
+}  // namespace xla_hlo
+
+namespace xla_lhlo {
+
+/// Collect a set of patterns to convert from the LHLO dialect to LLVM.
+void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options,
+                                          LLVMTypeConverter *converter,
+                                          OwningRewritePatternList *patterns);
+
+}  // namespace xla_lhlo
+
+namespace xla_chlo {
+
+// Populates a collection of conversion patterns for legalizing client-HLO to
+// HLO.
+void PopulateLegalizeChloToHloPatterns(MLIRContext *context,
+                                       OwningRewritePatternList *patterns);
+
+}  // namespace xla_chlo
+
+namespace xla {
+
+// Populates a pattern that translates the standard TanhOp to an approximation
+// that does not use intrinsics.
+void PopulateTanhToApproximationPatterns(MLIRContext *context,
+                                         OwningRewritePatternList *patterns);
+
+}  // namespace xla
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_REWRITERS_H_
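
Each Populate* entry point pairs with a ConversionTarget owned by some pass; the header deliberately exposes only the pattern hooks so embedders can mix them. A hedged sketch of the wiring for the chlo patterns follows; the pass class itself is hypothetical, while the dialect names are the ones declared in the headers above:

#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

struct LegalizeChloToHloPass
    : mlir::PassWrapper<LegalizeChloToHloPass, mlir::FunctionPass> {
  void runOnFunction() override {
    mlir::MLIRContext* ctx = &getContext();
    mlir::OwningRewritePatternList patterns;
    mlir::xla_chlo::PopulateLegalizeChloToHloPatterns(ctx, &patterns);

    // Client-HLO ops must all be rewritten away; plain HLO is legal.
    mlir::ConversionTarget target(*ctx);
    target.addIllegalDialect<mlir::xla_chlo::XlaHloClientDialect>();
    target.addLegalDialect<mlir::xla_hlo::XlaHloDialect>();

    if (failed(applyPartialConversion(getFunction(), target, patterns)))
      signalPassFailure();
  }
};

In practice the dynamic-broadcast patterns also emit shape-dialect ops (see the last file in this commit), so a real target would mark that dialect legal as well.
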
@@ -0,0 +1,165 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_CYCLE_DETECTOR_H_
+#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_CYCLE_DETECTOR_H_
+
+#include <vector>
+
+#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/DenseMap.h"
+
+namespace mlir {
+
+// -------------------------------------------------------------------
+
+// This file contains a light version of GraphCycles implemented in
+// tensorflow/compiler/jit/graphcycles/graphcycles.h
+//
+// We re-implement it here because we do not want to rely on TensorFlow data
+// structures, so that the corresponding passes can easily be moved to the
+// LLVM repo if necessary.
+
+// --------------------------------------------------------------------
+
+// This is a set data structure that provides a deterministic iteration order.
+// The iteration order of elements only depends on the sequence of
+// inserts/deletes, so as long as the inserts/deletes happen in the same
+// sequence, the set will have the same iteration order.
+//
+// Assumes that T can be cheaply copied for simplicity.
+template <typename T>
+class OrderedSet {
+ public:
+  // Inserts `value` into the ordered set. Returns true if the value was not
+  // present in the set before the insertion.
+  bool Insert(T value) {
+    bool new_insertion =
+        value_to_index_.insert({value, value_sequence_.size()}).second;
+    if (new_insertion) {
+      value_sequence_.push_back(value);
+    }
+    return new_insertion;
+  }
+
+  // Removes `value` from the set. Assumes `value` is already present in the
+  // set.
+  void Erase(T value) {
+    auto it = value_to_index_.find(value);
+
+    // Since we don't want to move values around in `value_sequence_`, we swap
+    // the value in the last position with the value to be deleted and then
+    // pop_back.
+    value_to_index_[value_sequence_.back()] = it->second;
+    std::swap(value_sequence_[it->second], value_sequence_.back());
+    value_sequence_.pop_back();
+    value_to_index_.erase(it);
+  }
+
+  void Reserve(size_t new_size) {
+    value_to_index_.reserve(new_size);
+    value_sequence_.reserve(new_size);
+  }
+
+  void Clear() {
+    value_to_index_.clear();
+    value_sequence_.clear();
+  }
+
+  bool Contains(T value) const { return value_to_index_.count(value); }
+  size_t Size() const { return value_sequence_.size(); }
+
+  const std::vector<T>& GetSequence() const { return value_sequence_; }
+
+ private:
+  // The stable order that we maintain through insertions and deletions.
+  std::vector<T> value_sequence_;
+
+  // Maps values to their indices in `value_sequence_`.
+  llvm::DenseMap<T, int> value_to_index_;
+};
+
+// ---------------------------------------------------------------------
+
+// GraphCycles detects the introduction of a cycle into a directed
+// graph that is being built up incrementally.
+//
+// Nodes are identified by small integers. It is not possible to
+// record multiple edges with the same (source, destination) pair;
+// requests to add an edge where one already exists are silently
+// ignored.
+//
+// It is also not possible to introduce a cycle; an attempt to insert
+// an edge that would introduce a cycle fails and returns false.
+//
+// GraphCycles uses no internal locking; calls into it should be
+// serialized externally.
+
+// Performance considerations:
+//   Works well on sparse graphs, poorly on dense graphs.
+//   Extra information is maintained incrementally to detect cycles quickly.
+//   InsertEdge() is very fast when the edge already exists, and reasonably
+//   fast otherwise.
+//   FindPath() is linear in the size of the graph.
+// The current implementation uses O(|V|+|E|) space.
+
+class GraphCycles {
+ public:
+  explicit GraphCycles(int32_t num_nodes);
+  ~GraphCycles();
+
+  // Attempt to insert an edge from x to y. If the edge would introduce a
+  // cycle, return false without making any changes. Otherwise add the edge
+  // and return true.
+  bool InsertEdge(int32_t x, int32_t y);
+
+  // Remove any edge that exists from x to y.
+  void RemoveEdge(int32_t x, int32_t y);
+
+  // Return whether there is an edge directly from x to y.
+  bool HasEdge(int32_t x, int32_t y) const;
+
+  // Contracts the edge from 'a' to node 'b', merging nodes 'a' and 'b'. One
+  // of the nodes is removed from the graph, and edges to/from it are added to
+  // the remaining one, which is returned. If contracting the edge would
+  // create a cycle, does nothing and returns no value.
+  llvm::Optional<int32_t> ContractEdge(int32_t a, int32_t b);
+
+  // Return whether dest_node `y` is reachable from source_node `x`
+  // by following edges. This is the non-thread-safe version.
+  bool IsReachable(int32_t x, int32_t y);
+
+  // Return a copy of the successors set. This is needed for code using the
+  // collection while modifying the GraphCycles.
+  std::vector<int32_t> SuccessorsCopy(int32_t node) const;
+
+  // Returns all nodes in post order.
+  //
+  // If there is a path from X to Y then X appears after Y in the
+  // returned vector.
+  std::vector<int32_t> AllNodesInPostOrder() const;
+
+  // ----------------------------------------------------
+  struct Rep;
+
+ private:
+  GraphCycles(const GraphCycles&) = delete;
+  GraphCycles& operator=(const GraphCycles&) = delete;
+
+  Rep* rep_;  // opaque representation
+};
+
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_CYCLE_DETECTOR_H_
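
The class comment pins down the observable contract, which a unit-test-style sketch makes concrete (assuming the header above is available for inclusion):

#include <cassert>

void GraphCyclesContractCheck() {
  mlir::GraphCycles graph(/*num_nodes=*/3);

  assert(graph.InsertEdge(0, 1));   // 0 -> 1 is accepted.
  assert(graph.InsertEdge(1, 2));   // 1 -> 2 is accepted.
  assert(!graph.InsertEdge(2, 0));  // 2 -> 0 would close a cycle: rejected.
  assert(graph.IsReachable(0, 2));  // Reachability is transitive.

  // Contracting 0 -> 1 merges the endpoints; the survivor inherits 1 -> 2.
  llvm::Optional<int32_t> merged = graph.ContractEdge(0, 1);
  assert(merged.hasValue() && graph.HasEdge(*merged, 2));
}
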
@ -0,0 +1,242 @@
|
||||||
|
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Shape/IR/Shape.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/OperationSupport.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
|
||||||
|
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
|
||||||
|
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
|
||||||
|
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
|
||||||
|
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/broadcast_utils.h"
|
||||||
|
|
||||||
|
namespace mlir {
|
||||||
|
namespace xla_chlo {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Converts binary ops that statically are determined to not broadcast directly
|
||||||
|
// to the corresponding xla_hlo non-broadcasting op.
|
||||||
|
template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
|
||||||
|
struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern<ChloOpTy> {
|
||||||
|
using OpRewritePattern<ChloOpTy>::OpRewritePattern;
|
||||||
|
LogicalResult matchAndRewrite(ChloOpTy op,
|
||||||
|
PatternRewriter &rewriter) const override {
|
||||||
|
// Only rewrite for statically determinable non-broadcasting cases.
|
||||||
|
auto lhs_type = op.lhs().getType().template dyn_cast<RankedTensorType>();
|
||||||
|
auto rhs_type = op.rhs().getType().template dyn_cast<RankedTensorType>();
|
||||||
|
if (!lhs_type || !rhs_type) return failure();
|
||||||
|
|
||||||
|
// Requires rank broadcast.
|
||||||
|
if (lhs_type.getRank() != rhs_type.getRank()) return failure();
|
||||||
|
// Any dynamic dimension may require broadcasting and requires more
|
||||||
|
// analysis.
|
||||||
|
if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape())
|
||||||
|
return failure();
|
||||||
|
|
||||||
|
for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) {
|
||||||
|
auto lhs_extent = std::get<0>(extents);
|
||||||
|
auto rhs_extent = std::get<1>(extents);
|
||||||
|
if (lhs_extent != rhs_extent) {
|
||||||
|
return failure();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
rewriter.replaceOp(op, {Adaptor::CreateOp(op, op.getResult().getType(),
|
||||||
|
op.lhs(), op.rhs(), rewriter)});
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Converts a binary op with ranked broadcasting operands to explicitly
|
||||||
|
// broadcast and invoke the corresponding xla_hlo non-broadcasting op.
|
||||||
|
// Note that dynamic broadcasting supported by this pattern is only valid for
|
||||||
|
// "numpy" broadcasting semantics as defined here:
|
||||||
|
// https://docs.scipy.org/doc/numpy/reference/ufuncs.html
|
||||||
|
// Specifically, this includes the following cases:
|
||||||
|
// - Same rank broadcast (operands have the same static rank).
|
||||||
|
// - Different-rank broadcast, either without a broadcast_dims attribte or
|
||||||
|
// with the broadcast_dims attribute set to map to a prefix padding.
|
||||||
|
// - Legal combinations of degenerate (1-dim) implicit broadcasting.
|
||||||
|
// The restriction on broadcast_dims derives from the definition of the
|
||||||
|
// `shape.broadcast` op, which only supports prefix-padding.
|
||||||
|
//
|
||||||
|
// It may be possible to expand this pattern to operate on unranked tensors in
|
||||||
|
// the future by emitting more code to dynamically differentiate based on rank.
|
||||||
|
// Whether that is of any practical benefit remains to be seen.
template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
struct ConvertRankedDynamicBroadcastBinaryOp
    : public OpRewritePattern<ChloOpTy> {
  using OpRewritePattern<ChloOpTy>::OpRewritePattern;
  LogicalResult matchAndRewrite(ChloOpTy op,
                                PatternRewriter &rewriter) const override {
    // Only support ranked operands.
    Value lhs = op.lhs();
    Value rhs = op.rhs();
    auto lhs_type = lhs.getType().dyn_cast<RankedTensorType>();
    auto rhs_type = rhs.getType().dyn_cast<RankedTensorType>();
    auto result_type =
        op.getResult().getType().template dyn_cast<RankedTensorType>();
    if (!lhs_type || !rhs_type || !result_type) return failure();

    // Check for "numpy"-style rank broadcast.
    auto broadcast_dimensions = op.broadcast_dimensions();
    if (broadcast_dimensions &&
        !xla::IsLegalNumpyRankedBroadcast(lhs, rhs, *broadcast_dimensions)) {
      // Note: It is unclear whether the general specification of explicit
      // broadcast_dimensions on binary ops is a feature we want to carry
      // forward. While it can technically be implemented for ranked-dynamic,
      // it is incompatible with unranked inputs. If this warning is emitted
      // in real programs, it is an indication that the feature should be
      // implemented versus just falling back on the more standard definition
      // of numpy-like prefix-padding.
      op.emitWarning() << "unsupported non prefix-padded dynamic rank "
                       << "broadcast_dimensions = " << *broadcast_dimensions;
      return failure();
    }

    // Compute result shape.
    auto loc = op.getLoc();

    // Insert a constraint on the shapes being broadcastable and insert all
    // future code into an assuming block reliant on the constraint.
    Value lhs_shape = rewriter.create<shape::ShapeOfOp>(loc, lhs);
    Value rhs_shape = rewriter.create<shape::ShapeOfOp>(loc, rhs);
    auto broadcastable_cstr =
        rewriter.create<shape::CstrBroadcastableOp>(loc, lhs_shape, rhs_shape);
    auto assuming_op = rewriter.create<shape::AssumingOp>(
        loc, ArrayRef<Type>{result_type}, broadcastable_cstr.result());

    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.createBlock(&assuming_op.doRegion());

    int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank());
    Value result_extents =
        xla::ComputeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs,
                                                               rewriter);

    // Note that we unconditionally emit DynamicBroadcastInDim ops and let
    // downstream canonicalizations fold them away if possible. This is
    // because, in the dynamic case, there are many corner cases regarding
    // when it is safe to omit, and some of them require analysis to prove
    // properly.
    auto lhs_broadcast_dimensions = llvm::to_vector<4>(
        llvm::seq<int64_t>(result_rank - lhs_type.getRank(), result_rank));
    Value broadcasted_lhs = rewriter.create<xla_hlo::DynamicBroadcastInDimOp>(
        loc,
        RankedTensorType::get(result_type.getShape(),
                              lhs_type.getElementType()),
        lhs, result_extents,
        rewriter.getI64TensorAttr(lhs_broadcast_dimensions));
    auto rhs_broadcast_dimensions = llvm::to_vector<4>(
        llvm::seq<int64_t>(result_rank - rhs_type.getRank(), result_rank));
    Value broadcasted_rhs = rewriter.create<xla_hlo::DynamicBroadcastInDimOp>(
        loc,
        RankedTensorType::get(result_type.getShape(),
                              rhs_type.getElementType()),
        rhs, result_extents,
        rewriter.getI64TensorAttr(rhs_broadcast_dimensions));

    // And generate the final non-broadcasted binary op.
    Value final_result = Adaptor::CreateOp(op, result_type, broadcasted_lhs,
                                           broadcasted_rhs, rewriter);
    rewriter.create<shape::AssumingYieldOp>(loc, final_result);
    rewriter.replaceOp(op, {assuming_op.getResult(0)});
    return success();
  }
};

template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
void PopulateForBinaryOp(MLIRContext *context,
                         OwningRewritePatternList *patterns) {
  patterns
      ->insert<ConvertTrivialNonBroadcastBinaryOp<ChloOpTy, HloOpTy, Adaptor>>(
          context, 10);
  patterns->insert<
      ConvertRankedDynamicBroadcastBinaryOp<ChloOpTy, HloOpTy, Adaptor>>(
      context, 5);
}

template <typename FromOpTy, typename ToOpTy>
struct HloBinaryElementwiseAdaptor {
  static ToOpTy CreateOp(FromOpTy from_op, Type result_type,
                         Value broadcasted_lhs, Value broadcasted_rhs,
                         OpBuilder &builder) {
    return builder.create<ToOpTy>(from_op.getLoc(), result_type,
                                  broadcasted_lhs, broadcasted_rhs);
  }
};

struct HloComplexAdaptor {
  static xla_hlo::ComplexOp CreateOp(BroadcastComplexOp from_op,
                                     Type result_type, Value broadcasted_lhs,
                                     Value broadcasted_rhs,
                                     OpBuilder &builder) {
    return builder.create<xla_hlo::ComplexOp>(from_op.getLoc(), result_type,
                                              broadcasted_lhs, broadcasted_rhs);
  }
};

struct HloCompareAdaptor {
  static xla_hlo::CompareOp CreateOp(BroadcastCompareOp from_op,
                                     Type result_type, Value broadcasted_lhs,
                                     Value broadcasted_rhs,
                                     OpBuilder &builder) {
    return builder.create<xla_hlo::CompareOp>(from_op.getLoc(), result_type,
                                              broadcasted_lhs, broadcasted_rhs,
                                              from_op.comparison_direction());
  }
};

} // namespace

void PopulateLegalizeChloToHloPatterns(MLIRContext *context,
                                       OwningRewritePatternList *patterns) {
  // Instantiate conversion templates for conforming binary elementwise ops
  // that do not have different dtypes between operands and results and do
  // not have special attributes that need to be preserved.
#define POPULATE_BCAST(ChloOp, HloOp)                                      \
  PopulateForBinaryOp<ChloOp, HloOp,                                       \
                      HloBinaryElementwiseAdaptor<ChloOp, HloOp>>(context, \
                                                                  patterns);

  POPULATE_BCAST(BroadcastAddOp, xla_hlo::AddOp);
  POPULATE_BCAST(BroadcastAndOp, xla_hlo::AndOp);
  POPULATE_BCAST(BroadcastAtan2Op, xla_hlo::Atan2Op);
  POPULATE_BCAST(BroadcastDivOp, xla_hlo::DivOp);
  POPULATE_BCAST(BroadcastMaxOp, xla_hlo::MaxOp);
  POPULATE_BCAST(BroadcastMinOp, xla_hlo::MinOp);
  POPULATE_BCAST(BroadcastMulOp, xla_hlo::MulOp);
  POPULATE_BCAST(BroadcastOrOp, xla_hlo::OrOp);
  POPULATE_BCAST(BroadcastPowOp, xla_hlo::PowOp);
  POPULATE_BCAST(BroadcastRemOp, xla_hlo::RemOp);
  POPULATE_BCAST(BroadcastShiftLeftOp, xla_hlo::ShiftLeftOp);
  POPULATE_BCAST(BroadcastShiftRightArithmeticOp,
                 xla_hlo::ShiftRightArithmeticOp);
  POPULATE_BCAST(BroadcastShiftRightLogicalOp, xla_hlo::ShiftRightLogicalOp);
  POPULATE_BCAST(BroadcastSubOp, xla_hlo::SubOp);
  POPULATE_BCAST(BroadcastXorOp, xla_hlo::XorOp);

  // Broadcasting ops requiring special construction.
  PopulateForBinaryOp<BroadcastComplexOp, xla_hlo::ComplexOp,
                      HloComplexAdaptor>(context, patterns);
  PopulateForBinaryOp<BroadcastCompareOp, xla_hlo::CompareOp,
                      HloCompareAdaptor>(context, patterns);
}

} // namespace xla_chlo
} // namespace mlir
@ -0,0 +1,57 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Shape/IR/Shape.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_chlo {

namespace {

struct TestChloLegalizeToHloPass
    : public PassWrapper<TestChloLegalizeToHloPass, FunctionPass> {
  void runOnFunction() override {
    ConversionTarget conversionTarget(getContext());
    OwningRewritePatternList conversionPatterns;

    conversionTarget.addIllegalDialect<XlaHloClientDialect>();
    // Consider the xla_hlo dialect legal for tests.
    conversionTarget.addLegalDialect<xla_hlo::XlaHloDialect>();
    // The conversion uses helpers from the Standard dialect.
    conversionTarget.addLegalDialect<mlir::StandardOpsDialect>();
    conversionTarget.addLegalDialect<mlir::shape::ShapeDialect>();

    PopulateLegalizeChloToHloPatterns(&getContext(), &conversionPatterns);

    if (failed(applyPartialConversion(getFunction(), conversionTarget,
                                      conversionPatterns))) {
      return signalPassFailure();
    }
  }
};

} // namespace

} // namespace xla_chlo
} // namespace mlir

static mlir::PassRegistration<mlir::xla_chlo::TestChloLegalizeToHloPass> pass(
    "test-xla-chlo-legalize-to-hlo",
    "Test pass for applying chlo -> hlo legalization patterns");
@ -0,0 +1,493 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering HLO dialect to LHLO dialect.

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/AffineMap.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/BlockAndValueMapping.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Builders.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Location.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/BufferPlacement.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_hlo {
namespace {

template <typename T>
using BaseOpConversion = BufferAssignmentOpConversionPattern<T>;
using StdReturnOpConverter =
    detail::BufferAssignmentReturnOpConverter<mlir::ReturnOp, mlir::ReturnOp,
                                              xla_lhlo::CopyOp, true>;

Value InsertDynamicAllocAndDealloc(Location loc, Value result,
                                   Value shape_operand,
                                   ConversionPatternRewriter* rewriter) {
  auto result_type = result.getType().dyn_cast<ShapedType>();
  if (!result_type) {
    result.getDefiningOp()->emitOpError()
        << "tensor to buffer conversion expects ranked results";
  }
  auto memref_type =
      MemRefType::get(result_type.getShape(), result_type.getElementType());

  Operation* op = result.getDefiningOp();

  // Extract the required dynamic extents out of the shape operand.
  SmallVector<Value, 4> dynamic_operands;
  for (auto shape_element : llvm::enumerate(result_type.getShape())) {
    if (shape_element.value() != ShapedType::kDynamicSize) continue;
    Value index = rewriter->create<ConstantOp>(
        loc, rewriter->getIntegerAttr(rewriter->getIndexType(),
                                      shape_element.index()));
    Value alloc_operand = rewriter->create<ExtractElementOp>(loc, shape_operand,
                                                             ValueRange{index});
    if (!alloc_operand.getType().isIndex()) {
      alloc_operand = rewriter->create<IndexCastOp>(loc, alloc_operand,
                                                    rewriter->getIndexType());
    }
    dynamic_operands.push_back(alloc_operand);
  }

  // Insert in front of op to ensure sizes are available.
  OpBuilder allocBuilder(op);
  auto alloc = allocBuilder.create<AllocOp>(loc, memref_type, dynamic_operands);
  return alloc;
}

Value InsertAlloc(Location loc, OpResult result,
                  BufferAssignmentPlacer* bufferAssignment,
                  ConversionPatternRewriter* rewriter) {
  auto result_type = result.getType().dyn_cast<ShapedType>();
  if (!result_type || !result_type.hasStaticShape()) {
    result.getDefiningOp()->emitOpError()
        << "tensor to buffer conversion expects statically shaped results";
  }
  auto memref_type =
      MemRefType::get(result_type.getShape(), result_type.getElementType());
  OpBuilder::InsertionGuard guard(*rewriter);
  rewriter->restoreInsertionPoint(
      bufferAssignment->computeAllocPosition(result));
  auto alloc = rewriter->create<AllocOp>(loc, memref_type);
  return alloc;
}
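
// Generic one-to-one conversion for ops without special buffer handling:
// every tensor result gets a backing buffer (InsertAlloc for statically
// shaped results, reified result shapes plus InsertDynamicAllocAndDealloc
// otherwise), and the op is re-created as its LHLO counterpart with the
// output buffers appended after the operands. Illustrative sketch (op
// syntax approximate):
//   %0 = "xla_hlo.add"(%a, %b)
//       : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
// becomes
//   %buf = alloc() : memref<4xf32>
//   "xla_lhlo.add"(%a, %b, %buf)
//       : (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()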
template <typename HloOpTy>
class HloToLhloOpConverter : public BaseOpConversion<HloOpTy> {
 public:
  using BaseOpConversion<HloOpTy>::BaseOpConversion;
  LogicalResult matchAndRewrite(
      HloOpTy hloOp, ArrayRef<Value> operands,
      ConversionPatternRewriter& rewriter) const final {
    Operation* op = hloOp.getOperation();
    const auto& original_results = op->getResults();
    SmallVector<Value, 4> buffer_args(operands.begin(), operands.end());
    for (auto result : llvm::enumerate(original_results)) {
      RankedTensorType resultType =
          result.value().getType().dyn_cast<RankedTensorType>();
      if (!resultType) {
        return failure();
      }
      if (resultType.hasStaticShape()) {
        buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(),
                                          this->bufferAssignment, &rewriter));
      } else {
        SmallVector<Value, 1> results_shape;
        auto shape_type_op = dyn_cast<InferShapedTypeOpInterface>(op);
        if (!shape_type_op) return failure();
        if (failed(
                shape_type_op.reifyReturnTypeShapes(rewriter, results_shape)))
          return failure();
        buffer_args.push_back(InsertDynamicAllocAndDealloc(
            op->getLoc(), result.value(), results_shape.front(), &rewriter));
      }
    }
    rewriter.create<xla_hlo::HloToLhloOp<HloOpTy>>(op->getLoc(), llvm::None,
                                                   buffer_args, op->getAttrs());
    rewriter.replaceOp(op, ArrayRef<Value>(buffer_args).slice(operands.size()));
    return success();
  }
};

struct HloToLhloDynamicBroadcastInDimOpConverter
    : public BaseOpConversion<xla_hlo::DynamicBroadcastInDimOp> {
 public:
  using BaseOpConversion<xla_hlo::DynamicBroadcastInDimOp>::BaseOpConversion;

  LogicalResult matchAndRewrite(
      xla_hlo::DynamicBroadcastInDimOp op, ArrayRef<Value> operands,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = op.getLoc();
    Value resultBuffer = InsertDynamicAllocAndDealloc(
        loc, op.getResult(), op.output_dimensions(), &rewriter);

    Value transformed_operand =
        InsertDynamicMemrefCastOp(op, operands.front(), &rewriter);
    rewriter.create<xla_lhlo::BroadcastInDimOp>(
        loc, transformed_operand, resultBuffer, op.broadcast_dimensions());

    rewriter.replaceOp(op, {resultBuffer});

    return success();
  }

 private:
  // Inserts a dynamic memref cast that changes the layout of the memref:
  // wherever size-1 dimension expansion is necessary, the stride is set to 0
  // and the size to that of the corresponding target dimension.
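  // Illustrative sketch (the printed form of the cast is approximate):
  // broadcasting a memref<1x?xf32> operand into a ?x? result uses stride 0
  // on the expanded leading dimension,
  //   xla_lhlo.dynamic_memref_cast %operand : memref<1x?xf32>
  //       -> memref<?x?xf32, #strided>  // sizes from output, strides {0, 1}
  // so that every index along that dimension aliases the same input row.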
  xla_lhlo::DynamicMemRefCastOp InsertDynamicMemrefCastOp(
      xla_hlo::DynamicBroadcastInDimOp op, Value operand, OpBuilder* b) const {
    auto loc = op.getLoc();
    auto operand_type = operand.getType().cast<MemRefType>();
    auto operand_shape = operand_type.getShape();

    SmallVector<Value, 2> sizes, strides;
    sizes.reserve(operand_shape.size());
    strides.reserve(operand_shape.size());

    Value zero = b->create<ConstantIndexOp>(loc, 0);
    Value one = b->create<ConstantIndexOp>(loc, 1);
    for (auto dim : llvm::enumerate(op.broadcast_dimensions())) {
      Value broadcast_dim_value =
          b->create<ConstantIndexOp>(loc, dim.value().getSExtValue());
      Value result_dim_size = b->create<ExtractElementOp>(
          loc, op.output_dimensions(), broadcast_dim_value);
      Value operand_dim_size =
          ShapedType::isDynamic(operand_shape[dim.index()])
              ? b->create<DimOp>(loc, operand, dim.index()).getResult()
              : b->create<ConstantIndexOp>(loc, operand_shape[dim.index()])
                    .getResult();

      // TODO(pifon): Revisit if this cast is needed. Maybe we can use
      // tensor<index> for `output_dimensions` as well.
      if (!result_dim_size.getType().isIndex()) {
        result_dim_size =
            b->create<IndexCastOp>(loc, result_dim_size, b->getIndexType());
      }

      // There can be two cases:
      // 1) Operand dim == result dim => expansion is not needed => stride := 1.
      // 2) Operand dim < result dim => expansion is needed => stride := 0.
      Value is_expansion = b->create<CmpIOp>(loc, CmpIPredicate::slt,
                                             operand_dim_size, result_dim_size);
      strides.push_back(
          b->create<mlir::SelectOp>(loc, is_expansion, zero, one));

      // Size of input dim can be set to the size of the corresponding output
      // dimension for both cases.
      sizes.push_back(result_dim_size);
    }

    // Type-erased memref type with static rank, dynamic sizes and strides.
    SmallVector<int64_t, 2> dynamic_layout(operand_shape.size(),
                                           MemRefType::kDynamicStrideOrOffset);
    SmallVector<int64_t, 2> dynamic_shape(operand_shape.size(),
                                          MemRefType::kDynamicSize);
    auto type_erased_memref_type = MemRefType::get(
        dynamic_shape, operand_type.getElementType(),
        makeStridedLinearLayoutMap(dynamic_layout,
                                   /*offset=*/0, b->getContext()));

    auto transformed_operand = b->create<xla_lhlo::DynamicMemRefCastOp>(
        loc, type_erased_memref_type, operand, sizes, strides);
    return transformed_operand;
  }
};
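
// Converts xla_hlo.reduce by inlining the reduction body and rewriting the
// entry block signature from tensors to memrefs, with one extra trailing
// argument for the output buffer. Rough sketch (exact printed IR may
// differ):
//   ^bb0(%lhs: tensor<f32>, %rhs: tensor<f32>):
//     ...
//     "xla_hlo.return"(%acc) ...
// becomes
//   ^bb0(%lhs: memref<f32>, %rhs: memref<f32>, %out: memref<f32>):
//     ...
//     "xla_lhlo.terminator"() : () -> ()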
struct HloToLhloReduceOpConverter : public BaseOpConversion<xla_hlo::ReduceOp> {
 public:
  using BaseOpConversion<xla_hlo::ReduceOp>::BaseOpConversion;

  LogicalResult matchAndRewrite(
      xla_hlo::ReduceOp op, ArrayRef<Value> operands,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = op.getLoc();
    // TODO(b/137624192) Implement variadic reduce.
    if (op.getNumResults() != 1) return failure();
    if (!llvm::hasSingleElement(op.body())) {
      return op.emitOpError()
             << "tensor to buffer conversion expects a single block "
                "in the region containing the operation";
    }
    const auto& original_results = op.getResults();
    SmallVector<Value, 4> buffer_args(operands.begin(), operands.end());
    for (auto result : original_results) {
      buffer_args.push_back(
          InsertAlloc(loc, result, this->bufferAssignment, &rewriter));
    }
    auto new_op = rewriter.create<xla_lhlo::ReduceOp>(
        loc, llvm::None, buffer_args, op.getAttrs());

    // Copy over the operations inside the region.
    rewriter.inlineRegionBefore(op.body(), new_op.body(), new_op.body().end());

    // Create new block arguments with correct type.
    auto& entry_block = new_op.body().front();
    int original_arg_count = entry_block.getNumArguments();
    for (int i = 0; i < original_arg_count; ++i) {
      auto old_arg = entry_block.getArgument(i);
      auto old_type = old_arg.getType().cast<TensorType>();
      auto new_type =
          MemRefType::get(old_type.getShape(), old_type.getElementType());
      auto new_arg = entry_block.addArgument(new_type);
      rewriter.replaceUsesOfBlockArgument(old_arg, new_arg);
    }
    // Add an argument for the result.
    entry_block.addArgument(
        entry_block.getArgument(original_arg_count).getType());
    // Remove the old arguments.
    for (int i = original_arg_count - 1; i >= 0; --i) {
      entry_block.eraseArgument(i);
    }
    // Insert terminator at the end.
    rewriter.setInsertionPointToEnd(&entry_block);
    rewriter.create<xla_lhlo::TerminatorOp>(loc);

    rewriter.replaceOp(op, ArrayRef<Value>(buffer_args).slice(operands.size()));

    return success();
  }
};

class HloToLhloTensorLoadOpConverter
    : public BaseOpConversion<mlir::TensorLoadOp> {
 public:
  using BaseOpConversion<mlir::TensorLoadOp>::BaseOpConversion;
  LogicalResult matchAndRewrite(
      mlir::TensorLoadOp op, ArrayRef<Value> operands,
      ConversionPatternRewriter& rewriter) const final {
    rewriter.replaceOp(op, operands);
    return success();
  }
};

// TODO(b/137624192): Rewrite into a copy and elide copy if possible.
class HloToLhloTensorStoreOpConverter
    : public BaseOpConversion<mlir::TensorStoreOp> {
 public:
  using BaseOpConversion<mlir::TensorStoreOp>::BaseOpConversion;

  LogicalResult matchAndRewrite(
      mlir::TensorStoreOp op, ArrayRef<Value> operands,
      ConversionPatternRewriter& rewriter) const final {
    rewriter.replaceOpWithNewOp<xla_lhlo::CopyOp>(
        op, llvm::None, operands.front(), operands.back());
    return success();
  }
};

// Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary
// buffers if necessary.
//
// Example fusion with HLO ops.
//
// func @fusion(%arg0: memref<2x2xf32>,
//              %arg1: memref<2x2xf32>,
//              %arg2: memref<2x2xf32>,
//              %arg3: memref<2x2xf32>) {
//   "xla_lhlo.fusion"() ({
//     %0 = tensor_load %arg1 : memref<2x2xf32>
//     %1 = tensor_load %arg2 : memref<2x2xf32>
//     %2 = "xla_hlo.add"(%0, %1) :
//         (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
//     %3 = tensor_load %arg0 : memref<2x2xf32>
//     %4 = "xla_hlo.multiply"(%2, %3) :
//         (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
//     tensor_store %4, %arg3 : memref<2x2xf32>
//     "xla_lhlo.terminator"() : () -> ()
//   }) : () -> ()
//   return
// }
//
// Transformed fusion with LHLO ops.
// func @fusion(%arg0: memref<2x2xf32>,
//              %arg1: memref<2x2xf32>,
//              %arg2: memref<2x2xf32>,
//              %arg3: memref<2x2xf32>) {
//   "xla_lhlo.fusion"() ( {
//     %0 = alloc() : memref<2x2xf32>
//     "xla_lhlo.add"(%arg1, %arg2, %0) :
//         (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
//     "xla_lhlo.multiply"(%0, %arg0, %arg3) :
//         (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
//     "xla_lhlo.terminator"() : () -> ()
//   }) : () -> ()
//   return
// }
//
// FuncOp signature conversion example:
//
// func @func_op(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
//   %0 = "xla_hlo.maximum"(%arg0, %arg1)
//       : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
//   %1 = "xla_hlo.add"(%arg0, %0)
//       : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
//   return %1 : tensor<4xf32>
// }
//
// Transformed function with an extra argument for the result. The types have
// been converted from tensor to memref.
//
// func @func_op(%arg0: memref<4xf32>,
//               %arg1: memref<4xf32>,
//               %arg2: memref<4xf32>) {
//   %0 = alloc() : memref<4xf32>
//   "xla_lhlo.maximum"(%arg0, %arg1, %0) :
//       (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()
//   %1 = alloc() : memref<4xf32>
//   "xla_lhlo.add"(%arg0, %0, %1) :
//       (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()
//   "xla_lhlo.copy"(%1, %arg2) : (memref<4xf32>, memref<4xf32>) -> ()
//   "xla_lhlo.terminator"() : () -> ()
// }

struct HloLegalizeToLhlo
    : public PassWrapper<HloLegalizeToLhlo, OperationPass<ModuleOp>> {
 public:
  HloLegalizeToLhlo() = default;
  HloLegalizeToLhlo(const HloLegalizeToLhlo& o) {
    this->results_escape_function = o.results_escape_function.getValue();
  }
  explicit HloLegalizeToLhlo(bool results_escape_function) {
    this->results_escape_function.setValue(results_escape_function);
  }

  void runOnOperation() override {
    OwningRewritePatternList patterns;
    auto& context = getContext();
    ConversionTarget target(context);
    target.addLegalDialect<xla_lhlo::XlaLhloDialect>();
    target.addLegalDialect<StandardOpsDialect>();
    target.addLegalOp<ModuleOp>();
    target.addIllegalOp<mlir::TensorLoadOp>();
    target.addIllegalOp<mlir::TensorStoreOp>();
    target.addLegalOp<ModuleTerminatorOp>();
    target.addLegalOp<TensorFromElementsOp>();
    target.addIllegalDialect<xla_hlo::XlaHloDialect>();

    BufferAssignmentTypeConverter converter;
    target.addDynamicallyLegalOp<FuncOp>([&](FuncOp op) {
      auto inputs = op.getType().getInputs();
      return llvm::all_of(inputs,
                          [](Type input) { return input.isa<MemRefType>(); }) &&
             converter.isLegal(&op.getBody());
    });
    target.addDynamicallyLegalOp<mlir::ReturnOp>([&](mlir::ReturnOp returnOp) {
      return std::all_of(returnOp.operand_type_begin(),
                         returnOp.operand_type_end(),
                         [](Type type) { return type.isa<MemRefType>(); });
    });

    auto module = getOperation();
    WalkResult result = module.walk([&](FuncOp func) -> WalkResult {
      BufferAssignmentPlacer bufferAssignment(func);
      OwningRewritePatternList patterns;
      populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment,
                                         &converter, &patterns);
      if (results_escape_function) {
        populateWithBufferAssignmentOpConversionPatterns<
            mlir::ReturnOp, mlir::ReturnOp, xla_lhlo::CopyOp,
            /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment,
                                                 &converter, &patterns);
      } else {
        populateWithBufferAssignmentOpConversionPatterns<
            mlir::ReturnOp, mlir::ReturnOp, xla_lhlo::CopyOp,
            /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment,
                                                  &converter, &patterns);
      }
      return applyPartialConversion(func, target, patterns);
    });
    if (result.wasInterrupted()) {
      signalPassFailure();
    }
  }

 private:
  Option<bool> results_escape_function{
      *this, "results-escape-function",
      llvm::cl::desc(
          "Allocate the results of functions within the function's body"),
      llvm::cl::init(false)};
};
} // namespace

void populateHLOToLHLOConversionPattern(
    MLIRContext* context, BufferAssignmentPlacer* bufferAssignment,
    TypeConverter* converter, OwningRewritePatternList* patterns) {
  // clang-format off
  patterns->insert<
      HloToLhloDynamicBroadcastInDimOpConverter,
      HloToLhloOpConverter<xla_hlo::AbsOp>,
      HloToLhloOpConverter<xla_hlo::AddOp>,
      HloToLhloOpConverter<xla_hlo::AndOp>,
      HloToLhloOpConverter<xla_hlo::BroadcastInDimOp>,
      HloToLhloOpConverter<xla_hlo::CeilOp>,
      HloToLhloOpConverter<xla_hlo::CompareOp>,
      HloToLhloOpConverter<xla_hlo::ComplexOp>,
      HloToLhloOpConverter<xla_hlo::ConstOp>,
      HloToLhloOpConverter<xla_hlo::ConvOp>,
      HloToLhloOpConverter<xla_hlo::ConvertOp>,
      HloToLhloOpConverter<xla_hlo::CopyOp>,
      HloToLhloOpConverter<xla_hlo::CosOp>,
      HloToLhloOpConverter<xla_hlo::DivOp>,
      HloToLhloOpConverter<xla_hlo::DotOp>,
      HloToLhloOpConverter<xla_hlo::ExpOp>,
      HloToLhloOpConverter<xla_hlo::GatherOp>,
      HloToLhloOpConverter<xla_hlo::ImagOp>,
      HloToLhloOpConverter<xla_hlo::IotaOp>,
      HloToLhloOpConverter<xla_hlo::LogOp>,
      HloToLhloOpConverter<xla_hlo::MaxOp>,
      HloToLhloOpConverter<xla_hlo::MinOp>,
      HloToLhloOpConverter<xla_hlo::MulOp>,
      HloToLhloOpConverter<xla_hlo::NegOp>,
      HloToLhloOpConverter<xla_hlo::RealOp>,
      HloToLhloOpConverter<xla_hlo::RemOp>,
      HloToLhloOpConverter<xla_hlo::RsqrtOp>,
      HloToLhloOpConverter<xla_hlo::ReshapeOp>,
      HloToLhloOpConverter<xla_hlo::SelectOp>,
      HloToLhloOpConverter<xla_hlo::SignOp>,
      HloToLhloOpConverter<xla_hlo::SqrtOp>,
      HloToLhloOpConverter<xla_hlo::SubOp>,
      HloToLhloOpConverter<xla_hlo::TanhOp>,
      HloToLhloReduceOpConverter,
      HloToLhloTensorLoadOpConverter,
      HloToLhloTensorStoreOpConverter
  >(context, bufferAssignment, converter);
  // clang-format on
}

std::unique_ptr<OperationPass<ModuleOp>> createLegalizeToLhloPass(
    bool results_escape_function) {
  return absl::make_unique<HloLegalizeToLhlo>(results_escape_function);
}

static PassRegistration<HloLegalizeToLhlo> legalize_pass(
    "hlo-legalize-to-lhlo", "Legalize from HLO dialect to LHLO dialect");

} // namespace xla_hlo
} // namespace mlir
@ -0,0 +1,237 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering XLA control flow ops to MLIR
// control flow.

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/STLExtras.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringSwitch.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/Casting.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Block.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/BlockAndValueMapping.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Builders.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/TypeUtilities.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/PassRegistry.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Support/LogicalResult.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"

using mlir::PassRegistration;

namespace mlir {
namespace xla_hlo {
namespace {
struct LegalizeControlFlow
    : public mlir::PassWrapper<LegalizeControlFlow, FunctionPass> {
  // Perform the lowering to MLIR control flow.
  void runOnFunction() override;
};

// Replaces terminators for the newly created blocks from a target region.
// These terminators are replaced with branch operations to a target block.
LogicalResult ReplaceTerminators(Region* region, Block* target_block,
                                 Location loc,
                                 const BlockAndValueMapping& mapper,
                                 OpBuilder* builder) {
  for (auto& old_block : region->getBlocks()) {
    Block* block = mapper.lookup(&old_block);
    auto return_op = dyn_cast<xla_hlo::ReturnOp>(block->getTerminator());
    if (!return_op) continue;
    builder->setInsertionPointToEnd(block);
    builder->create<mlir::BranchOp>(loc, target_block, return_op.getOperands());
    return_op.erase();
  }

  return success();
}
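
// Lowers xla_hlo.if into explicit CFG. Illustrative sketch (block names
// hypothetical, op syntax approximate):
//   %r = "xla_hlo.if"(%pred, %targ, %farg) {true_branch, false_branch}
// becomes
//   %p = extract_element %pred[] : tensor<i1>
//   cond_br %p, ^true(%targ), ^false(%farg)
// with both inlined regions branching to a shared ^tail block that carries
// the result as a block argument.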
LogicalResult LowerIfOp(mlir::xla_hlo::IfOp if_op) {
  Operation* op_inst = if_op.getOperation();
  mlir::OpBuilder builder(if_op);
  auto orig_block = op_inst->getBlock();
  auto* tail_block = orig_block->splitBlock(op_inst);
  auto loc = if_op.getLoc();

  // Duplicate the true and false regions in the block between the sections
  // before and after the conditional.
  BlockAndValueMapping mapper;
  if_op.true_branch().cloneInto(orig_block->getParent(),
                                Region::iterator(tail_block), mapper);
  if_op.false_branch().cloneInto(orig_block->getParent(),
                                 Region::iterator(tail_block), mapper);

  // Determine the blocks for the start of the true and false regions.
  Block* true_block = mapper.lookup(&if_op.true_branch().front());
  Block* false_block = mapper.lookup(&if_op.false_branch().front());

  // Perform the conditional branch into the true/false cases.
  builder.setInsertionPointToEnd(orig_block);

  // Extract the predicate for checking branching, then branch to the true and
  // false regions appropriately.
  auto cond_value = builder.create<mlir::ExtractElementOp>(loc, if_op.pred());
  builder.create<mlir::CondBranchOp>(loc, cond_value, true_block,
                                     if_op.true_arg(), false_block,
                                     if_op.false_arg());

  // Replace the true and false cases' return operations with a branch to the
  // tail of the conditional.
  if (failed(ReplaceTerminators(&if_op.true_branch(), tail_block, loc, mapper,
                                &builder)))
    return failure();
  if (failed(ReplaceTerminators(&if_op.false_branch(), tail_block, loc, mapper,
                                &builder)))
    return failure();

  tail_block->addArguments(if_op.getResult().getType());
  if_op.getResult().replaceAllUsesWith(tail_block->getArgument(0));

  op_inst->erase();
  return success();
}

LogicalResult LowerWhileOp(mlir::xla_hlo::WhileOp while_op) {
  // Converts an XLA while loop into control flow. This generates a set of MLIR
  // blocks and branches, along with inlining the regions provided by the XLA
  // while loop. The structure should be similar to below:
  //
  //   <prior operations>
  //   %0 = "xla_hlo.while"(%arg0) {^cond(...){...}, ^body(...){...}}
  //   <post operations>
  auto* op_inst = while_op.getOperation();
  mlir::OpBuilder builder(while_op);
  auto loc = while_op.getLoc();

  // Break the block into four sections:
  // orig_block - operations before the while and the branch into the looping
  //              check.
  // tail_block - operations after the while loop completes.
  // cond_block - check the looping condition, then conditionally branch into
  //              the loop or, if the condition is false, jump to the tail
  //              branch.
  // body_block - inlined loop body, then jump back to the condition block.
  auto* orig_block = op_inst->getBlock();
  auto* tail_block = orig_block->splitBlock(op_inst);

  BlockAndValueMapping mapper;
  while_op.cond().cloneInto(orig_block->getParent(),
                            Region::iterator(tail_block), mapper);
  while_op.body().cloneInto(orig_block->getParent(),
                            Region::iterator(tail_block), mapper);

  // Lookup the entry blocks for both condition and body.
  auto* cond_block = mapper.lookup(&while_op.cond().front());
  auto* body_block = mapper.lookup(&while_op.body().front());

  // Setup the end of the original block:
  //   <prior operations>
  //   br ^cond(%arg0) // Jumps to the condition statement.
  builder.setInsertionPointToEnd(orig_block);
  builder.create<mlir::BranchOp>(loc, cond_block, while_op.getOperand());

  // Updates the inlined condition blocks by replacing the return op with an
  // extract_element and conditional branch. This changes the block below:
  //   ^cond(%0):
  //     <inlined conditional region>
  //     "xla_hlo".return(%1)
  //
  // Into:
  //   ^cond(%0):
  //     <inlined conditional region>
  //     %2 = extract_element %1[] : tensor<i1> // Extract the condition value.
  //     cond_br %2, ^body(%0), ^tail(%0) // Branch.
  builder.setInsertionPointToStart(cond_block);

  // Replace the xla_hlo::ReturnOp with a conditional branch into the body or
  // the tail. This is required as the xla_hlo::ReturnOp is used to mark the
  // end of a block for regions nested inside of an operation (MLIR ReturnOp
  // cannot be nested within a non-function region).
  for (auto& block : while_op.cond()) {
    auto new_block = mapper.lookup(&block);

    auto return_op = dyn_cast<xla_hlo::ReturnOp>(new_block->getTerminator());
    if (!return_op) continue;
    builder.setInsertionPointToEnd(new_block);

    auto return_value = return_op.getOperand(0);
    auto cond_value = builder.create<mlir::ExtractElementOp>(loc, return_value);

    // Get the body block arguments.
    llvm::SmallVector<Value, 4> successor_args(cond_block->args_begin(),
                                               cond_block->args_end());
    builder.create<mlir::CondBranchOp>(loc, cond_value, body_block,
                                       successor_args, tail_block,
                                       successor_args);
    return_op.erase();
  }

  // Updates the body blocks by replacing the return op with a branch to the
  // conditional block. This changes the block below:
  //   ^body(%0):
  //     <inlined body block>
  //     "xla_hlo".return(%1)
  //
  // Into:
  //   ^body(%0):
  //     <inlined body block>
  //     br ^cond(%0) // Branch.
  for (auto& block : while_op.body()) {
    auto new_block = mapper.lookup(&block);
    auto return_op =
        dyn_cast<mlir::xla_hlo::ReturnOp>(new_block->getTerminator());
    if (!return_op) continue;
    builder.setInsertionPointToEnd(new_block);
    builder.create<mlir::BranchOp>(loc, cond_block, return_op.getOperands());
    return_op.erase();
  }

  // Erase the original while loop.
  tail_block->addArgument(while_op.getType());
  while_op.getResult().replaceAllUsesWith(tail_block->getArgument(0));
  op_inst->erase();

  return success();
}

void LegalizeControlFlow::runOnFunction() {
  auto func = getFunction();
  llvm::SmallVector<IfOp, 4> if_ops;
  func.walk([&](IfOp op) { if_ops.push_back(op); });

  for (auto& op : if_ops) {
    if (failed(LowerIfOp(op))) return signalPassFailure();
  }

  llvm::SmallVector<WhileOp, 4> while_ops;
  func.walk([&](WhileOp op) { while_ops.push_back(op); });

  for (auto& op : while_ops) {
    if (failed(LowerWhileOp(op))) return signalPassFailure();
  }
}
} // namespace
} // namespace xla_hlo
} // namespace mlir

std::unique_ptr<mlir::OperationPass<mlir::FuncOp>>
mlir::xla_hlo::createLegalizeControlFlowPass() {
  return std::make_unique<LegalizeControlFlow>();
}

static PassRegistration<mlir::xla_hlo::LegalizeControlFlow> legalize_cf_pass(
    "xla-legalize-control-flow",
    "Legalize from XLA control flow to MLIR control flow");
@ -0,0 +1,156 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering the standard tanh op to an
// approximation.
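//
// Sketch of the method, as implemented below: the input is clamped to
// roughly [-7.905, 7.905] and tanh(x) is approximated by the rational
// function
//   tanh(x) ~= x * P(x^2) / Q(x^2)
// where P and Q are the polynomials given by numerator_coeffs (degree 6 in
// x^2) and denominator_coeffs (degree 3 in x^2), both evaluated with
// Horner's scheme; for |x| < 0.0004 the identity approximation
// tanh(x) ~= x is selected instead.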

#include <array>

#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla {
namespace {

/// Emits the fast tanh approximation that is also used by XLA.
Value EmitTanhApproximation(Value input, Location loc,
                            PatternRewriter &rewriter) {
  // For small values of x, we can approximate tanh(x)=x. For extremely small
  // values of x (|x| < 1e-37), the other approximation would evaluate
  // tanh(x) = 0.
  constexpr float kCanUseApprox = 0.0004;
  Value abs_value = rewriter.create<AbsFOp>(loc, input);
  Value can_use_approx =
      rewriter.create<ConstantOp>(loc, rewriter.getF32FloatAttr(kCanUseApprox));
  Value return_input = rewriter.create<CmpFOp>(loc, CmpFPredicate::OLT,
                                               abs_value, can_use_approx);
  // Clamp the input to [-c, c].
  Value max_clamp = rewriter.create<ConstantOp>(
      loc, rewriter.getF32FloatAttr(7.90531110763549805f));
  Value smaller_than_max =
      rewriter.create<CmpFOp>(loc, CmpFPredicate::ULE, input, max_clamp);
  Value clamped_half =
      rewriter.create<SelectOp>(loc, smaller_than_max, input, max_clamp);
  Value min_clamp = rewriter.create<ConstantOp>(
      loc, rewriter.getF32FloatAttr(-7.90531110763549805f));
  Value larger_than_min =
      rewriter.create<CmpFOp>(loc, CmpFPredicate::UGE, clamped_half, min_clamp);
  Value input_clamped =
      rewriter.create<SelectOp>(loc, larger_than_min, clamped_half, min_clamp);

  static constexpr std::array<float, 7> numerator_coeffs{
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f};

  static constexpr std::array<float, 4> denominator_coeffs{
      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
      4.89352518554385e-03f};

  Value input_squared =
      rewriter.create<MulFOp>(loc, input_clamped, input_clamped);
  Value numerator = rewriter.create<ConstantOp>(
      loc, rewriter.getF32FloatAttr(numerator_coeffs[0]));
  for (int i = 1; i < numerator_coeffs.size(); i++) {
    numerator = rewriter.create<AddFOp>(
        loc, rewriter.create<MulFOp>(loc, input_squared, numerator),
        rewriter.create<ConstantOp>(
            loc, rewriter.getF32FloatAttr(numerator_coeffs[i])));
  }

  numerator = rewriter.create<MulFOp>(loc, input_clamped, numerator);

  Value denominator = rewriter.create<ConstantOp>(
      loc, rewriter.getF32FloatAttr(denominator_coeffs[0]));
  for (int i = 1; i < denominator_coeffs.size(); i++) {
    denominator = rewriter.create<AddFOp>(
        loc, rewriter.create<MulFOp>(loc, input_squared, denominator),
        rewriter.create<ConstantOp>(
            loc, rewriter.getF32FloatAttr(denominator_coeffs[i])));
  }

  Value approx = rewriter.create<DivFOp>(loc, numerator, denominator);

  return rewriter.create<SelectOp>(loc, return_input, input, approx);
}

class ApproximateTanhLowering : public OpRewritePattern<TanhOp> {
 public:
  explicit ApproximateTanhLowering(MLIRContext *ctx)
      : OpRewritePattern<TanhOp>(ctx, 100) {}

  LogicalResult matchAndRewrite(TanhOp tanhOp,
                                PatternRewriter &rewriter) const override {
    Type operand_type = tanhOp.getType();

    if (operand_type.isF64()) {
      // Similar to XLA, do not rewrite f64 as precision might matter.
      return failure();
    }

    Location loc = tanhOp.getLoc();
    Value input = tanhOp.operand();
    if (operand_type.isF16()) {
      input = rewriter.create<FPExtOp>(loc, input, rewriter.getF32Type());
    }

    // If we still do not have f32, fail.
    if (!input.getType().isF32()) {
      return failure();
    }

    Value result = EmitTanhApproximation(input, loc, rewriter);

    // Truncate back if needed.
    if (operand_type.isF16()) {
      result = rewriter.create<FPTruncOp>(loc, result, rewriter.getF16Type());
    }

    rewriter.replaceOp(tanhOp, {result});
    return success();
  }
};

struct LegalizeTanhToApproximation
    : public PassWrapper<LegalizeTanhToApproximation, FunctionPass> {
  /// Perform the lowering of standard dialect operations to approximations.
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    PopulateTanhToApproximationPatterns(&getContext(), &patterns);
    applyPatternsAndFoldGreedily(getFunction(), patterns);
  }
};

} // anonymous namespace

std::unique_ptr<mlir::OperationPass<mlir::FuncOp>>
createLegalizeTanhToApproximationPass() {
  return std::make_unique<LegalizeTanhToApproximation>();
}

void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context,
                                         OwningRewritePatternList *patterns) {
  patterns->insert<ApproximateTanhLowering>(context);
}

static PassRegistration<LegalizeTanhToApproximation> legalize_pass(
    "xla-legalize-tanh-to-approximation",
    "Legalize tanh from standard dialect to an approximation");

} // namespace xla
} // namespace mlir
@ -0,0 +1,208 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering XLA dialect to Standard dialect.

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringSwitch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace {
#include "third_party/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/generated_legalize_to_standard.inc"
} // end anonymous namespace
namespace xla_hlo {
namespace {
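
// The two patterns below convert same-shape xla_hlo.compare ops on integer
// and float tensors into standard-dialect cmpi/cmpf ops. Illustrative
// sketch (op syntax approximate):
//   %r = "xla_hlo.compare"(%a, %b) {comparison_direction = "LT"}
//       : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1>
// becomes
//   %r = cmpi "slt", %a, %b : tensor<4xi32>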
class CompareIConvert : public OpRewritePattern<xla_hlo::CompareOp> {
 public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(xla_hlo::CompareOp op,
                                PatternRewriter &rewriter) const override {
    auto lhs = op.lhs();
    auto rhs = op.rhs();
    auto lhs_type = lhs.getType().cast<TensorType>();
    auto rhs_type = rhs.getType().cast<TensorType>();

    // Broadcasting not supported by this rewrite.
    if (lhs_type.getShape() != rhs_type.getShape()) return failure();

    if (!lhs_type.getElementType().isSignlessInteger() ||
        !rhs_type.getElementType().isSignlessInteger())
      return failure();

    auto comparison_direction = op.comparison_direction();
    auto compare_predicate =
        llvm::StringSwitch<Optional<CmpIPredicate>>(comparison_direction)
            .Case("EQ", CmpIPredicate::eq)
            .Case("NE", CmpIPredicate::ne)
            .Case("LT", CmpIPredicate::slt)
            .Case("LE", CmpIPredicate::sle)
            .Case("GT", CmpIPredicate::sgt)
            .Case("GE", CmpIPredicate::sge)
            .Default(llvm::None);

    if (!compare_predicate.hasValue()) return failure();

    rewriter.replaceOpWithNewOp<CmpIOp>(op, compare_predicate.getValue(), lhs,
                                        rhs);
    return success();
  }
};

class CompareFConvert : public OpRewritePattern<xla_hlo::CompareOp> {
 public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(xla_hlo::CompareOp op,
                                PatternRewriter &rewriter) const override {
    auto lhs = op.lhs();
    auto rhs = op.rhs();
    auto lhs_type = lhs.getType().cast<TensorType>();
    auto rhs_type = rhs.getType().cast<TensorType>();

    // Broadcasting not supported by this rewrite.
    if (lhs_type.getShape() != rhs_type.getShape()) return failure();

    if (!lhs_type.getElementType().isa<FloatType>() ||
        !rhs_type.getElementType().isa<FloatType>())
      return failure();

    auto comparison_direction = op.comparison_direction();
    auto compare_predicate =
        llvm::StringSwitch<Optional<CmpFPredicate>>(comparison_direction)
            .Case("EQ", CmpFPredicate::OEQ)
            .Case("NE", CmpFPredicate::UNE)
            .Case("LT", CmpFPredicate::OLT)
            .Case("LE", CmpFPredicate::OLE)
            .Case("GT", CmpFPredicate::OGT)
            .Case("GE", CmpFPredicate::OGE)
            .Default(llvm::None);

    if (!compare_predicate.hasValue()) return failure();

    rewriter.replaceOpWithNewOp<CmpFOp>(op, compare_predicate.getValue(), lhs,
                                        rhs);
    return success();
  }
};

// Replaces IotaOp with an integer constant; a ConvertOp is added to convert
// the integer constant to the iota result type. For complex types, the real
// part is set to the generated constant and the imaginary part is set to a
// zero tensor.
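// For example (worked out from the index arithmetic below), an iota of type
// tensor<2x3xi32> with iota_dimension = 1 becomes the constant
//   dense<[[0, 1, 2], [0, 1, 2]]> : tensor<2x3xi32>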
|
||||||
|
class ConvertIotaOp : public OpRewritePattern<xla_hlo::IotaOp> {
|
||||||
|
public:
|
||||||
|
using OpRewritePattern::OpRewritePattern;
|
||||||
|
|
||||||
|
LogicalResult matchAndRewrite(xla_hlo::IotaOp op,
|
||||||
|
PatternRewriter &rewriter) const override {
|
||||||
|
auto output_type = op.getType().cast<ShapedType>();
|
||||||
|
auto output_size = output_type.getNumElements();
|
||||||
|
auto dimension = op.iota_dimension().getSExtValue();
|
||||||
|
auto max_dim_size = output_type.getDimSize(dimension);
|
||||||
|
|
||||||
|
auto element_type = output_type.getElementType();
|
||||||
|
int bitwidth;
|
||||||
|
|
||||||
|
auto complex_ty = element_type.dyn_cast<ComplexType>();
|
||||||
|
Type int_or_float_ty = element_type;
|
||||||
|
if (complex_ty) int_or_float_ty = complex_ty.getElementType();
|
||||||
|
|
||||||
|
bitwidth = int_or_float_ty.getIntOrFloatBitWidth();
|
||||||
|
llvm::SmallVector<APInt, 10> values;
|
||||||
|
values.reserve(output_size);
|
||||||
|
|
||||||
|
int64_t increase_stride = output_size;
|
||||||
|
for (int i = 0; i <= dimension; i++) {
|
||||||
|
increase_stride /= output_type.getDimSize(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t current_value = 0;
|
||||||
|
for (int i = 0; i < output_size; i++) {
|
||||||
|
int64_t value = (current_value / increase_stride) % max_dim_size;
|
||||||
|
values.push_back(APInt(bitwidth, value));
|
||||||
|
++current_value;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto int_shape_type = RankedTensorType::get(
|
||||||
|
output_type.getShape(),
|
||||||
|
IntegerType::get(bitwidth, rewriter.getContext()));
|
||||||
|
auto loc = op.getLoc();
|
||||||
|
auto integer_const = rewriter.create<mlir::ConstantOp>(
|
||||||
|
loc, DenseIntElementsAttr::get(int_shape_type, values));
|
||||||
|
|
||||||
|
auto int_or_float_shape_ty =
|
||||||
|
RankedTensorType::get(output_type.getShape(), int_or_float_ty);
|
||||||
|
|
||||||
|
auto iota_const =
|
||||||
|
rewriter.create<ConvertOp>(loc, int_or_float_shape_ty, integer_const);
|
||||||
|
|
||||||
|
// For int/float types we are done, replace op and return.
|
||||||
|
if (!complex_ty) {
|
||||||
|
rewriter.replaceOp(op, iota_const.getResult());
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
|
||||||
|
// For complex types, generate a constant tensor of zeroes for the imaginary
|
||||||
|
// part and use iota_const for real part.
|
||||||
|
auto zeroes = rewriter.create<mlir::ConstantOp>(
|
||||||
|
loc, DenseIntElementsAttr::get(int_shape_type, APInt(bitwidth, 0)));
|
||||||
|
auto imag_zeroes =
|
||||||
|
rewriter.create<ConvertOp>(loc, int_or_float_shape_ty, zeroes);
|
||||||
|
rewriter.replaceOpWithNewOp<xla_hlo::ComplexOp>(op, iota_const,
|
||||||
|
imag_zeroes);
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
};
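
// For instance, the iota pattern above (a sketch, assuming a 1-D iota over
// dimension 0):
//
//   %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xf32>
//
// becomes a dense integer constant followed by a convert:
//
//   %cst = constant dense<[0, 1, 2, 3]> : tensor<4xi32>
//   %0 = "xla_hlo.convert"(%cst) : (tensor<4xi32>) -> tensor<4xf32>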

}  // end anonymous namespace

namespace {
struct LegalizeToStandard
    : public PassWrapper<LegalizeToStandard, FunctionPass> {
  /// Perform the lowering to Standard dialect.
  void runOnFunction() override;
};
}  // end anonymous namespace

std::unique_ptr<mlir::OperationPass<mlir::FuncOp>> createLegalizeToStdPass() {
  return std::make_unique<LegalizeToStandard>();
}

void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns,
                              mlir::MLIRContext *ctx) {
  mlir::populateWithGenerated(ctx, patterns);
  patterns->insert<CompareFConvert, CompareIConvert, ConvertIotaOp>(ctx);
}

/// Perform the lowering to standard dialect.
void LegalizeToStandard::runOnFunction() {
  OwningRewritePatternList patterns;
  mlir::xla_hlo::PopulateXlaToStdPatterns(&patterns, &getContext());
  applyPatternsAndFoldGreedily(getFunction(), patterns);
}

static PassRegistration<LegalizeToStandard> legalize_pass(
    "xla-legalize-to-std", "Legalize from XLA dialect to standard dialect");

}  // end namespace xla_hlo
}  // end namespace mlir

@ -0,0 +1,71 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This is the legalization pattern definition file for XLA to StandardOps.

include "third_party/llvm/llvm-project/mlir/include/mlir/IR/OpBase.td"
include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td"
include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td"

//===----------------------------------------------------------------------===//
// Nullary op patterns.
//===----------------------------------------------------------------------===//

def : Pat<(HLO_ConstOp ElementsAttr:$value),
          (ConstantOp $value)>;

//===----------------------------------------------------------------------===//
// Binary op patterns.
//===----------------------------------------------------------------------===//

def IsSameSizePred : CPred<
    "$0.getType().cast<ShapedType>().getShape() "
    "== $1.getType().cast<ShapedType>().getShape()">;
def IsSameSizeConstraint : Constraint<IsSameSizePred, "inputs are same size">;
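
// As an illustration of the DRR patterns below (a sketch, not taken from a
// test): the floating-point HLO_AddOp pattern rewrites
//
//   %0 = "xla_hlo.add"(%l, %r)
//       : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
//
// into the standard-dialect op
//
//   %0 = addf %l, %r : tensor<4xf32>
//
// and only fires when the IsSameSizeConstraint on the operands holds.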

def : Pat<(HLO_AndOp HLO_PredTensor:$l, HLO_PredTensor:$r),
          (AndOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_AddOp HLO_FpTensor:$l, HLO_FpTensor:$r),
          (AddFOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_SubOp HLO_FpTensor:$l, HLO_FpTensor:$r),
          (SubFOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_MulOp HLO_FpTensor:$l, HLO_FpTensor:$r),
          (MulFOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_DivOp HLO_FpTensor:$l, HLO_FpTensor:$r),
          (DivFOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_RemOp HLO_FpTensor:$l, HLO_FpTensor:$r),
          (RemFOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_AddOp HLO_IntTensor:$l, HLO_IntTensor:$r),
          (AddIOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_SubOp HLO_IntTensor:$l, HLO_IntTensor:$r),
          (SubIOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_MulOp HLO_IntTensor:$l, HLO_IntTensor:$r),
          (MulIOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_DivOp HLO_IntTensor:$l, HLO_IntTensor:$r),
          (SignedDivIOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;
def : Pat<(HLO_RemOp HLO_IntTensor:$l, HLO_IntTensor:$r),
          (SignedRemIOp $l, $r),
          [(IsSameSizeConstraint $l, $r)]>;

@ -0,0 +1,105 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements a pass to remove redundant LHLO copy operations.

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"

namespace mlir {
namespace xla_lhlo {
namespace {

// Removes LHLO copy operations that copy from allocated buffers to block
// arguments. All uses of each buffer are replaced with the corresponding block
// argument and the buffer is freed. Note that this pass only works in regions
// with a single block.
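//
// A sketch of the rewrite (illustrative, not taken from a test):
//
//   func @fusion(%out: memref<4xf32>, ...) {
//     %tmp = alloc() : memref<4xf32>
//     ... ops writing into %tmp ...
//     "xla_lhlo.copy"(%tmp, %out) : (memref<4xf32>, memref<4xf32>) -> ()
//     dealloc %tmp : memref<4xf32>
//   }
//
// becomes the same body with every use of %tmp replaced by %out and the
// alloc, copy and dealloc erased.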
struct LhloCopyRemoval : mlir::PassWrapper<LhloCopyRemoval, OperationPass<>> {
  void runOnOperation() override {
    llvm::SmallVector<mlir::Operation*, 2> eraseList;
    auto operation = getOperation();
    operation->walk([&](mlir::xla_lhlo::CopyOp copyOp) {
      // If this region contains more than one block, then ignore this copy
      // operation.
      if (copyOp.getParentRegion()->getBlocks().size() > 1) {
        return;
      }

      mlir::Value fromOperand = copyOp.operand();
      mlir::Value toOperand = copyOp.output();

      // If the fromOperand value is a block argument or the toOperand value
      // is not a block argument, then ignore this copy operation.
      if (!fromOperand.getDefiningOp() || toOperand.getDefiningOp()) {
        return;
      }

      // Removing the copy operation is illegal if there is at least one use
      // of the toOperand value that lies between the first use of the
      // fromOperand value and the copy operation.
      auto fromOperandUsers = fromOperand.getUsers();
      auto firstUser = *fromOperandUsers.begin();
      for (auto op : fromOperandUsers) {
        if (op->isBeforeInBlock(firstUser)) firstUser = op;
      }
      for (auto op : toOperand.getUsers()) {
        if (op->isBeforeInBlock(copyOp) && firstUser->isBeforeInBlock(op)) {
          return;
        }
      }

      // TODO(DFKI): Use live variable analysis to solve aliasing issues among
      // block arguments.

      // Remove the associated alloc operation.
      auto allocOp = fromOperand.getDefiningOp();
      eraseList.push_back(allocOp);

      // Iterate over all uses of the fromOperand to find the associated
      // deallocOp (if any).
      for (auto op : fromOperandUsers) {
        if (isa<mlir::DeallocOp>(op)) {
          eraseList.push_back(op);
          break;
        }
      }

      // Replace all uses of the fromOperand with the toOperand. This rewires
      // all references pointing to the original alloc operation to the new
      // target operation in order to safely remove the copy op.
      fromOperand.replaceAllUsesWith(toOperand);
      copyOp.erase();
    });
    for (auto op : eraseList) {
      op->erase();
    }
  }
};

}  // namespace

std::unique_ptr<Pass> createLhloCopyRemovalPass() {
  return absl::make_unique<LhloCopyRemoval>();
}

static PassRegistration<LhloCopyRemoval> copy_removal_pass(
    "lhlo-copy-removal", "Removes redundant LHLO copy operations");

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,151 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for fusing linalg ops obtained after LHLO
// lowering.

#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/ArrayRef.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/STLExtras.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/FoldUtils.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"

namespace mlir {
namespace xla_lhlo {
namespace {

using linalg::LinalgOp;

class LhloFuseLinalg : public PassWrapper<LhloFuseLinalg, FunctionPass> {
 public:
  LhloFuseLinalg() = default;
  LhloFuseLinalg(const LhloFuseLinalg&) {}
  LhloFuseLinalg(bool use_parallel_loops, llvm::ArrayRef<unsigned> tile_sizes) {
    tile_sizes_ = tile_sizes;
    use_parallel_loops_.setValue(use_parallel_loops);
  }

  void runOnFunction() override {
    auto func = getFunction();

    // TODO(pifon): Remove the assumption that the function has a single block.
    if (!llvm::hasSingleElement(func)) {
      emitError(func.getLoc(), "The function needs to have a single block.");
      signalPassFailure();
      return;
    }

    // Fusion in Linalg is currently possible only when the consumer op is
    // tiled. In order to greedily fuse the ops, we have to start from the
    // tiled root linalg ops, i.e. linalg ops that write to output buffers of
    // the function or are returned in case of escaping allocations.
    llvm::SmallDenseSet<Value> result_buffers;
    for (auto func_arg : func.getArguments()) {
      result_buffers.insert(func_arg);
    }
    for (auto& block : func) {
      auto returnOp = mlir::dyn_cast<mlir::ReturnOp>(block.getTerminator());
      if (!returnOp) continue;
      for (auto operand : returnOp.getOperands()) {
        result_buffers.insert(operand);
      }
    }
    MLIRContext* ctx = func.getContext();
    OpBuilder b(func);
    OperationFolder folder(ctx);
    func.walk([&](linalg::GenericOp generic_op) {
      SmallVector<int64_t, 2> tile_sizes(tile_sizes_.begin(),
                                         tile_sizes_.end());
      if (tile_sizes.empty()) {
        tile_sizes = SmallVector<int64_t, 2>(generic_op.getNumLoops(), 1);
      }
      auto op = cast<LinalgOp>(generic_op.getOperation());
      for (const Value result : op.getOutputBuffers()) {
        if (!result_buffers.count(result)) continue;
        if (tileGenericOp(op, tile_sizes, &b)) {
          generic_op.erase();
          return;
        }
      }
    });
    auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx);
    applyPatternsAndFoldGreedily(func, patterns);

    // Fuse producers of tiled linalg ops.
    llvm::SmallDenseSet<Operation*> erase_set;
    SmallVector<Operation*, 8> linalg_ops;
    func.walk([&](LinalgOp op) { linalg_ops.push_back(op); });
    for (auto* op : llvm::reverse(linalg_ops)) {
      for (unsigned id = 0, e = LinalgOp(op).getNumInputs(); id < e; ++id) {
        linalg::Aliases aliases;
        linalg::LinalgDependenceGraph graph(aliases, linalg_ops);
        if (auto info = fuseProducerOf(b, op, id, graph, &folder)) {
          auto originalOp = info->originalProducer.getOperation();
          erase_set.insert(originalOp);
          auto originalOpInLinalgOpsVector = std::find_if(
              linalg_ops.begin(), linalg_ops.end(),
              [&](const Operation* op) { return op == originalOp; });
          *originalOpInLinalgOpsVector = info->fusedProducer.getOperation();
        }
      }

      auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx);
      applyPatternsAndFoldGreedily(func, patterns);
    }
    for (auto* e : erase_set) e->erase();
  }

 private:
  bool tileGenericOp(LinalgOp op, ArrayRef<int64_t> tile_sizes, OpBuilder* b) {
    auto loopType = use_parallel_loops_
                        ? linalg::LinalgTilingLoopType::ParallelLoops
                        : linalg::LinalgTilingLoopType::Loops;
    auto tiled_generic_op = linalg::tileLinalgOp(*b, op,
                                                 linalg::LinalgTilingOptions()
                                                     .setTileSizes(tile_sizes)
                                                     .setLoopType(loopType));
    return tiled_generic_op.hasValue();
  }

  Option<bool> use_parallel_loops_{
      *this, "use-parallel-loops",
      llvm::cl::desc(
          "Tiles GenericOp consumer to parallel loops before linalg fusion"),
      llvm::cl::init(false)};

  ListOption<unsigned> tile_sizes_{
      *this, "tile-sizes",
      llvm::cl::desc(
          "Tile sizes by which to tile linalg generic before linalg fusion"),
      llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
};

}  // namespace

std::unique_ptr<OperationPass<FuncOp>> createLhloFuseLinalg(
    bool use_parallel_loops, ArrayRef<unsigned> tile_sizes) {
  return absl::make_unique<LhloFuseLinalg>(use_parallel_loops, tile_sizes);
}

static PassRegistration<LhloFuseLinalg> legalize_pass(
    "lhlo-fuse-linalg",
    "Greedily fuse linalg ops obtained after LHLO lowering.");

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,161 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering the LHLO dialect to the Affine
// dialect.

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Location.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_xla_to_scalar_op.h"

namespace mlir {
namespace xla_lhlo {
namespace {

// Builds an affine loop nest iterating from zeros to "upper_bounds" with unit
// steps, and populates the body of the innermost loop using "body_builder".
static void BuildBoundedAffineLoopNest(
    OpBuilder& builder, Location location, ArrayRef<int64_t> upper_bounds,
    function_ref<void(OpBuilder&, Location, ValueRange)> body_builder) {
  SmallVector<int64_t, 3> lower_bounds(upper_bounds.size(), /*Value=*/0);
  SmallVector<int64_t, 3> steps(upper_bounds.size(), /*Value=*/1);
  buildAffineLoopNest(builder, location, lower_bounds, upper_bounds, steps,
                      body_builder);
}

struct DotOpConverter : public OpRewritePattern<DotOp> {
  using OpRewritePattern<DotOp>::OpRewritePattern;

  // Supports only rank-2 tensors for LHS and RHS.
  LogicalResult matchAndRewrite(DotOp op,
                                PatternRewriter& rewriter) const override {
    Value lhs = op.lhs();
    Value rhs = op.rhs();
    MemRefType lhs_type = lhs.getType().cast<MemRefType>();
    MemRefType rhs_type = rhs.getType().cast<MemRefType>();
    Type element_type = lhs_type.getElementType();
    ArrayRef<int64_t> shape_lhs = lhs_type.getShape();
    ArrayRef<int64_t> shape_rhs = rhs_type.getShape();

    if ((lhs_type.getRank() != 2) || (rhs_type.getRank() != 2)) {
      return failure();
    }

    LogicalResult map_status = success();
    auto body_builder = [&](OpBuilder& builder, Location loc, ValueRange ivs) {
      SmallVector<Value, 2> lhs_indices{ivs[0], ivs[2]},
          rhs_indices{ivs[2], ivs[1]}, result_indices{ivs[0], ivs[1]};

      auto l = builder.create<AffineLoadOp>(loc, lhs, lhs_indices);
      auto r = builder.create<AffineLoadOp>(loc, rhs, rhs_indices);
      auto result =
          rewriter.create<AffineLoadOp>(loc, op.output(), result_indices);
      Value op_result = xla_lhlo::XlaOpToStdScalarOp::map<DotOp>(
          op, element_type, {l, r, result}, &builder);
      map_status = success(op_result != nullptr);
      if (failed(map_status)) return;
      builder.create<AffineStoreOp>(loc, op_result, op.output(),
                                    result_indices);
    };

    BuildBoundedAffineLoopNest(rewriter, op.getLoc(),
                               {shape_lhs[0], shape_rhs[1], shape_rhs[0]},
                               body_builder);
    if (failed(map_status)) return failure();

    rewriter.eraseOp(op);
    return success();
  }
};
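
// Lowering sketch for the dot pattern above (illustrative shapes):
//
//   "xla_lhlo.dot"(%lhs, %rhs, %out)
//       : (memref<4x8xf32>, memref<8x16xf32>, memref<4x16xf32>) -> ()
//
// becomes a triply nested affine loop nest over (i, j, k) that loads
// %lhs[i, k] and %rhs[k, j], combines them with the current value of
// %out[i, j] via the scalar mapping, and stores the result back.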

template <typename LhloOpTy>
struct BinaryOpConverter : public OpRewritePattern<LhloOpTy> {
  using OpRewritePattern<LhloOpTy>::OpRewritePattern;

  LogicalResult matchAndRewrite(LhloOpTy op,
                                PatternRewriter& rewriter) const override {
    const auto& lhs = op.lhs();
    const auto& rhs = op.rhs();
    const auto& lhs_type = lhs.getType().template cast<MemRefType>();
    const auto& rhs_type = rhs.getType().template cast<MemRefType>();
    const auto& element_type = lhs_type.getElementType();

    if (lhs_type.getShape() != rhs_type.getShape()) {
      return failure();
    }

    LogicalResult map_status = success();
    auto body_builder = [&](OpBuilder& builder, Location loc,
                            ValueRange induction_vars) {
      auto l = builder.create<AffineLoadOp>(loc, lhs, induction_vars);
      auto r = builder.create<AffineLoadOp>(loc, rhs, induction_vars);
      Value op_result = xla_lhlo::XlaOpToStdScalarOp::map<LhloOpTy>(
          op, element_type, {l, r}, &builder);
      map_status = success(op_result != nullptr);
      if (failed(map_status)) return;
      rewriter.create<AffineStoreOp>(loc, op_result, op.out(), induction_vars);
    };

    BuildBoundedAffineLoopNest(rewriter, op.getLoc(), lhs_type.getShape(),
                               body_builder);
    if (failed(map_status)) return failure();
    rewriter.eraseOp(op);
    return success();
  }
};
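
// For example (a sketch): an element-wise add on rank-1 buffers,
//
//   "xla_lhlo.add"(%lhs, %rhs, %out)
//       : (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> ()
//
// lowers to roughly
//
//   affine.for %i = 0 to 4 {
//     %l = affine.load %lhs[%i] : memref<4xf32>
//     %r = affine.load %rhs[%i] : memref<4xf32>
//     %s = addf %l, %r : f32
//     affine.store %s, %out[%i] : memref<4xf32>
//   }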

void populateLHLOToAffineConversionPattern(MLIRContext* context,
                                           OwningRewritePatternList* patterns) {
  // clang-format off
  patterns->insert<
      BinaryOpConverter<xla_lhlo::AddOp>,
      BinaryOpConverter<xla_lhlo::AndOp>,
      BinaryOpConverter<xla_lhlo::DivOp>,
      BinaryOpConverter<xla_lhlo::MaxOp>,
      BinaryOpConverter<xla_lhlo::MinOp>,
      BinaryOpConverter<xla_lhlo::MulOp>,
      BinaryOpConverter<xla_lhlo::SubOp>,
      DotOpConverter>(context);
  // clang-format on
}

struct LhloLegalizeToAffine
    : public PassWrapper<LhloLegalizeToAffine, FunctionPass> {
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto func = getFunction();
    populateLHLOToAffineConversionPattern(func.getContext(), &patterns);
    applyPatternsAndFoldGreedily(func, patterns);
  }
};

}  // namespace

std::unique_ptr<OperationPass<FuncOp>> createLegalizeToAffinePass() {
  return absl::make_unique<LhloLegalizeToAffine>();
}

static PassRegistration<LhloLegalizeToAffine> legalize_pass(
    "lhlo-legalize-to-affine", "Legalize from LHLO dialect to affine dialect");

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,196 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering the LHLO dialect to the GPU dialect.

#include <cstdint>

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/ArrayRef.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/GPU/GPUDialect.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/SCF/SCF.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/BlockAndValueMapping.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Builders.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Location.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_xla_to_scalar_op.h"

namespace mlir {
namespace xla_lhlo {
namespace {

// A simple translation of LHLO reduce operations to a corresponding gpu
// launch operation. The transformation does no tiling and also only supports
// 1d results.
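//
// As a rough sketch: a reduction over dimension 0,
//
//   "xla_lhlo.reduce"(%in, %init, %out) ({...})
//       {dimensions = dense<[0]> : tensor<1xi64>}
//       : (memref<100x5xf32>, memref<f32>, memref<5xf32>) -> ()
//
// becomes a gpu.launch with one thread per output element; each thread seeds
// %out with the init value and then runs a sequential scf.for over the
// reduction dimension, applying the cloned reduction body.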
class LhloReduceToGPULaunchConverter : public OpConversionPattern<ReduceOp> {
 public:
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(
      ReduceOp reduce_op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = reduce_op.getLoc();
    // Only support 1d reductions for now.
    int64_t size = 0;
    for (auto result : reduce_op.out()) {
      auto shaped_type = result.getType().dyn_cast<ShapedType>();
      if (!shaped_type || shaped_type.getRank() != 1) {
        return failure();
      }
      auto dim_size = shaped_type.getDimSize(0);
      if (size && size != dim_size) {
        return failure();
      }
      size = dim_size;
    }

    auto reducing_dimension = *reduce_op.dimensions().int_value_begin();

    // Require all inputs to have the same shape.
    int64_t reduce_dim_size = 0;
    for (auto input : reduce_op.operands()) {
      auto shaped_type = input.getType().dyn_cast<ShapedType>();
      if (!shaped_type || !shaped_type.hasStaticShape()) {
        return failure();
      }
      reduce_dim_size =
          shaped_type.getDimSize(reducing_dimension.getSExtValue());
    }

    // Create a launch that is parallel in the result dimension.
    auto block_size_x = rewriter.create<mlir::ConstantOp>(
        loc, rewriter.getIndexType(),
        rewriter.getIntegerAttr(rewriter.getIndexType(), size));
    auto one = rewriter.create<mlir::ConstantOp>(
        loc, rewriter.getIndexType(),
        rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
    auto launch_op = rewriter.create<mlir::gpu::LaunchOp>(
        loc, one, one, one, block_size_x, one, one);
    {
      OpBuilder::InsertionGuard guard(rewriter);
      rewriter.setInsertionPointToEnd(&launch_op.body().front());
      auto index = launch_op.getThreadIds().x;

      // Load the initial value and store it to the output.
      for (auto pair : llvm::zip(reduce_op.init_values(), reduce_op.out())) {
        auto init_value = rewriter.create<mlir::LoadOp>(loc, std::get<0>(pair));
        rewriter.create<mlir::StoreOp>(loc, init_value, std::get<1>(pair),
                                       ArrayRef<Value>{index});
      }

      // Insert a loop into the body to compute the reduction. The loop ranges
      // over [0, dim).
      auto zero = rewriter.create<mlir::ConstantOp>(
          loc, rewriter.getIndexType(),
          rewriter.getIntegerAttr(rewriter.getIndexType(), 0));
      // TODO(b/137624192) Use dimOp to make it shape independent.
      auto upper = rewriter.create<mlir::ConstantOp>(
          loc, rewriter.getIndexType(),
          rewriter.getIntegerAttr(rewriter.getIndexType(), reduce_dim_size));
      auto step = rewriter.create<mlir::ConstantOp>(
          loc, rewriter.getIndexType(),
          rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
      auto loop = rewriter.create<mlir::scf::ForOp>(loc, zero, upper, step);

      rewriter.setInsertionPointToStart(loop.getBody());
      // Compute memrefs for the value to reduce. This makes it easier to just
      // inline the body.
      auto output = *reduce_op.out().begin();
      // TODO(herhut) Move this to the SliceOp builder.
      auto resType = MemRefType::get(
          llvm::None, output.getType().cast<MemRefType>().getElementType(),
          makeStridedLinearLayoutMap(llvm::None,
                                     MemRefType::getDynamicStrideOrOffset(),
                                     rewriter.getContext()));
      auto accumulator = rewriter.create<mlir::linalg::SliceOp>(
          loc, resType, output, ArrayRef<Value>{launch_op.getThreadIds().x});
      llvm::SmallVector<Value, 4> indexings;
      auto input_buffer = *reduce_op.operands().begin();
      auto input_type = input_buffer.getType().cast<MemRefType>();
      for (int64_t dim = 0; dim < input_type.getRank(); ++dim) {
        indexings.push_back(dim == reducing_dimension
                                ? loop.getInductionVar()
                                : launch_op.getThreadIds().x);
      }
      // TODO(herhut) Move this to the SliceOp builder.
      auto input = *reduce_op.operand_begin();
      auto rhs = rewriter.create<mlir::linalg::SliceOp>(
          loc,
          MemRefType::get(
              llvm::None, input_type.getElementType(),
              makeStridedLinearLayoutMap(llvm::None,
                                         MemRefType::getDynamicStrideOrOffset(),
                                         rewriter.getContext())),
          input, indexings);

      // Now copy over the actual body of the reduction, leaving out the
      // terminator.
      BlockAndValueMapping mapping;
      mapping.map(reduce_op.body().front().getArgument(0), accumulator);
      mapping.map(reduce_op.body().front().getArgument(1), rhs);
      mapping.map(reduce_op.body().front().getArgument(2), accumulator);
      for (auto& nested : reduce_op.body().front().without_terminator()) {
        auto clone = rewriter.clone(nested, mapping);
        for (auto pair : llvm::zip(nested.getResults(), clone->getResults())) {
          mapping.map(std::get<0>(pair), std::get<1>(pair));
        }
      }

      // Finally, insert the terminator for the launchOp.
      rewriter.setInsertionPointToEnd(&launch_op.body().front());
      rewriter.create<mlir::gpu::TerminatorOp>(loc);
    }

    rewriter.eraseOp(reduce_op);
    return success();
  }
};

struct LhloLegalizeToGpu : public PassWrapper<LhloLegalizeToGpu, FunctionPass> {
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    ConversionTarget target(getContext());
    target.addLegalDialect<linalg::LinalgDialect, StandardOpsDialect,
                           gpu::GPUDialect, scf::SCFDialect, XlaLhloDialect>();
    target.addIllegalOp<ReduceOp>();
    auto func = getFunction();
    patterns.insert<LhloReduceToGPULaunchConverter>(func.getContext());
    if (failed(applyPartialConversion(func, target, patterns))) {
      signalPassFailure();
    }
  }
};

}  // namespace

std::unique_ptr<OperationPass<FuncOp>> createLegalizeToGpuPass() {
  return absl::make_unique<LhloLegalizeToGpu>();
}

static PassRegistration<LhloLegalizeToGpu> legalize_pass(
    "lhlo-legalize-to-gpu", "Legalize from LHLO dialect to GPU dialect");

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,136 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"

namespace mlir {
namespace xla_lhlo {
namespace {

struct StaticMemRefCastOpConverter
    : public ConvertOpToLLVMPattern<StaticMemRefCastOp> {
  using ConvertOpToLLVMPattern<StaticMemRefCastOp>::ConvertOpToLLVMPattern;

  LogicalResult matchAndRewrite(
      Operation *op, ArrayRef<Value> operands,
      ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    auto cast_op = cast<StaticMemRefCastOp>(op);

    StaticMemRefCastOp::Adaptor operands_adaptor(operands);
    MemRefDescriptor sourceMemRef(operands_adaptor.operand());

    MemRefType targetMemRefType =
        cast_op.getResult().getType().cast<MemRefType>();
    auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType)
                                      .dyn_cast_or_null<LLVM::LLVMType>();
    if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy())
      return failure();
    // Create descriptor.
    auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy);
    Type llvmTargetElementTy = desc.getElementType();
    // Set allocated ptr.
    Value allocated = sourceMemRef.allocatedPtr(rewriter, loc);
    allocated =
        rewriter.create<LLVM::BitcastOp>(loc, llvmTargetElementTy, allocated);
    desc.setAllocatedPtr(rewriter, loc, allocated);
    // Set aligned ptr.
    Value ptr = sourceMemRef.alignedPtr(rewriter, loc);
    ptr = rewriter.create<LLVM::BitcastOp>(loc, llvmTargetElementTy, ptr);
    desc.setAlignedPtr(rewriter, loc, ptr);

    // Fill size and stride descriptors in memref.
    auto target_sizes = targetMemRefType.getShape();
    int64_t target_offset;
    llvm::SmallVector<int64_t, 4> target_strides;
    if (failed((getStridesAndOffset(targetMemRefType, target_strides,
                                    target_offset))))
      return failure();

    // Copy offset of `targetMemRef`.
    desc.setConstantOffset(rewriter, loc, target_offset);
    for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) {
      desc.setConstantSize(rewriter, loc, i, target_sizes[i]);
      desc.setConstantStride(rewriter, loc, i, target_strides[i]);
    }
    rewriter.replaceOp(op, {desc});
    return success();
  }
};
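
// Note (an informal summary): the LLVM memref descriptor produced above is a
// struct of the form {allocated pointer, aligned pointer, offset, sizes
// array, strides array}. StaticMemRefCastOp only retargets the two pointers
// and fills in the statically known offset, sizes and strides of the result
// type; no data is copied.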

struct DynamicMemRefCastOpConverter
    : public ConvertOpToLLVMPattern<DynamicMemRefCastOp> {
  using ConvertOpToLLVMPattern<DynamicMemRefCastOp>::ConvertOpToLLVMPattern;

  LogicalResult matchAndRewrite(
      Operation *op, ArrayRef<Value> operands,
      ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    auto cast_op = cast<DynamicMemRefCastOp>(op);

    DynamicMemRefCastOp::Adaptor operands_adaptor(operands);
    MemRefDescriptor sourceMemRef(operands_adaptor.operand());

    MemRefType targetMemRefType =
        cast_op.getResult().getType().cast<MemRefType>();
    auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType)
                                      .dyn_cast_or_null<LLVM::LLVMType>();
    if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy())
      return failure();
    // Create descriptor.
    auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy);
    Type llvmTargetElementTy = desc.getElementType();
    // Set allocated ptr.
    Value allocated = sourceMemRef.allocatedPtr(rewriter, loc);
    allocated =
        rewriter.create<LLVM::BitcastOp>(loc, llvmTargetElementTy, allocated);
    desc.setAllocatedPtr(rewriter, loc, allocated);
    // Set aligned ptr.
    Value ptr = sourceMemRef.alignedPtr(rewriter, loc);
    ptr = rewriter.create<LLVM::BitcastOp>(loc, llvmTargetElementTy, ptr);
    desc.setAlignedPtr(rewriter, loc, ptr);
    // Copy offset of `sourceMemRef`.
    desc.setOffset(rewriter, loc, sourceMemRef.offset(rewriter, loc));

    // Fill size and stride descriptors in memref.
    if (!cast_op.sizes().empty()) {
      auto sizes = operands_adaptor.sizes();
      auto strides = operands_adaptor.strides();
      for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) {
        desc.setSize(rewriter, loc, i, sizes[i]);
        desc.setStride(rewriter, loc, i, strides[i]);
      }
    }
    rewriter.replaceOp(op, {desc});
    return success();
  }
};

}  // namespace

void PopulateLhloToLLVMConversionPatterns(const LowerToLLVMOptions &options,
                                          LLVMTypeConverter *converter,
                                          OwningRewritePatternList *patterns) {
  patterns->insert<DynamicMemRefCastOpConverter, StaticMemRefCastOpConverter>(
      *converter, options);
}

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,59 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_lhlo {
namespace {

class TestLhloToLLVMPass
    : public ::mlir::PassWrapper<TestLhloToLLVMPass,
                                 ::mlir::OperationPass<::mlir::ModuleOp>> {
 public:
  void runOnOperation() override {
    ModuleOp m = getOperation();

    OwningRewritePatternList patterns;
    LLVMTypeConverter converter(m.getContext());
    populateStdToLLVMConversionPatterns(converter, patterns);
    PopulateLhloToLLVMConversionPatterns(
        LowerToLLVMOptions::getDefaultOptions(), &converter, &patterns);

    ConversionTarget target(getContext());
    target.addLegalDialect<LLVM::LLVMDialect>();
    target.addLegalOp<ModuleOp, ModuleTerminatorOp>();
    target.addIllegalDialect<XlaLhloDialect>();

    if (failed(applyFullConversion(m, target, patterns))) {
      signalPassFailure();
    }
  }
};

}  // namespace

static PassRegistration<TestLhloToLLVMPass> legalize_lhlo_pass(
    "test-lhlo-legalize-to-llvm", "Legalize from LHLO dialect to LLVM.");

}  // namespace xla_lhlo
}  // namespace mlir

@ -0,0 +1,731 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/ArrayRef.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/STLExtras.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/SmallVector.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/SCF/SCF.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"

namespace mlir {
namespace xla_lhlo {
namespace {

// Clones and adapts the code in `lhlo_block` that works on buffers and has a
// single output buffer to make it compatible with `operands` that have element
// types of the respective buffers. Returns the computed value.
//
// Example. For `operands` with (f32, i32) types and a block with LHLO ops and
// with signature:
//   ^bb(%lhs: memref<f32>, %rhs: memref<i32>, %res: memref<i1>):
//     <LHLO_ops>
//
// inserts the necessary alloc and store ops to compute and return a result
// that has `i1` type.
Value ApplySingleResultLhloCode(Location loc, ValueRange operands,
                                Block* lhlo_block, OpBuilder* b) {
  SmallVector<Value, 2> arg_bufs;
  for (auto arg_type : lhlo_block->getArgumentTypes()) {
    arg_bufs.push_back(b->create<AllocOp>(loc, arg_type.cast<MemRefType>()));
  }
  for (auto operand : llvm::enumerate(operands)) {
    b->create<StoreOp>(loc, operand.value(), arg_bufs[operand.index()]);
  }
  // Clone the ops from `lhlo_block`.
  BlockAndValueMapping mapping;
  mapping.map(lhlo_block->getArguments(), arg_bufs);
  for (auto& nested : lhlo_block->without_terminator()) {
    auto clone = b->clone(nested, mapping);
    mapping.map(nested.getResults(), clone->getResults());
  }
  return b->create<LoadOp>(loc, arg_bufs.back());
}

// Converts a block with LHLO ops and with signature:
//   ^bb(%lhs: memref<f32>, %rhs: memref<f32>, %res: memref<f32>):
// into a reduction operator of scf.reduce by doing buffer allocation for
// scalar arguments and the result of `scf.reduce` to make it compatible with
// LHLO ops.
void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op,
                                Block* lhlo_block, OpBuilder* b) {
  Block& loop_reduce_op_body = reduce_op.reductionOperator().front();
  OpBuilder::InsertionGuard guard(*b);
  b->setInsertionPointToStart(&loop_reduce_op_body);
  b->create<scf::ReduceReturnOp>(
      loc, ApplySingleResultLhloCode(loc, loop_reduce_op_body.getArguments(),
                                     lhlo_block, b));
}

// Returns the result of a ConstantOp if `dim` is static, otherwise uses DimOp
// to extract the dimension at runtime.
Value GetStaticOrDynamicDim(mlir::Location loc, Value shaped_value,
                            size_t dim_index, int64_t dim, OpBuilder* b) {
  return dim == ShapedType::kDynamicSize
             ? b->create<DimOp>(loc, shaped_value, dim_index).getResult()
             : b->create<ConstantIndexOp>(loc, dim);
}

struct MappedIvs {
  // False if the mapped indices are in the padding area, true otherwise.
  Value in_bounds;
  // Mapped indices.
  SmallVector<Value, 2> ivs;
};

template <typename OpTy>
MappedIvs MapWindowIvsToInput(OpTy op, ValueRange ivs, ValueRange window_ivs,
                              OpBuilder* b) {
  MappedIvs mapped_ivs;

  if (!op.window_strides().hasValue()) {
    op.emitOpError("No window strides specified.");
  }
  auto window_strides = op.window_strides().getValue();

  if (!op.padding().hasValue()) {
    op.emitOpError("No padding specified.");
  }
  auto padding = op.padding().getValue();

  auto loc = op.getLoc();
  auto operand = op.operand();
  auto operand_shape = operand.getType().template cast<MemRefType>().getShape();

  // `in_bounds` is false when the mapped indices are in the padding area.
  mapped_ivs.in_bounds = b->create<mlir::ConstantOp>(
      loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 1));
  for (unsigned i = 0, e = ivs.size(); i < e; ++i) {
    auto stride = window_strides.template getValue<llvm::APInt>(i);
    auto pad_low = padding.template getValue<llvm::APInt>({i, 0});

    Value stride_val = b->create<ConstantIndexOp>(loc, stride.getSExtValue());
    Value pad_low_val = b->create<ConstantIndexOp>(loc, pad_low.getSExtValue());

    Value center = b->create<MulIOp>(loc, ivs[i], stride_val);
    Value offset = b->create<SubIOp>(loc, window_ivs[i], pad_low_val);
    Value index = b->create<AddIOp>(loc, center, offset);
    Value upper_bound =
        GetStaticOrDynamicDim(loc, operand, i, operand_shape[i], b);
    // We must check whether 0 <= index_i < shape_i, as otherwise we are in
    // the pad and then we have to use the neutral element for reduction.
    // Equivalently, it can be computed as the unsigned comparison index_i <
    // shape_i, since a negative value wraps to a large positive value.
    mapped_ivs.in_bounds = b->create<mlir::AndOp>(
        loc, mapped_ivs.in_bounds,
        b->create<CmpIOp>(loc, CmpIPredicate::ult, index, upper_bound));
    mapped_ivs.ivs.push_back(index);
  }
  return mapped_ivs;
}
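
// As a worked example of the index math above (illustrative numbers): with
// stride 2, low padding 1, output iv %i = 3 and window iv %w = 0, the mapped
// input index is 3 * 2 + 0 - 1 = 5, and `in_bounds` holds iff 0 <= 5 < shape_i
// (checked as a single unsigned comparison).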

// Returns an scf::ParallelOp over a shaped value with a static or dynamic
// shape.
scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value,
                                  OpBuilder* b) {
  Value zero = b->create<ConstantIndexOp>(loc, 0);
  Value one = b->create<ConstantIndexOp>(loc, 1);

  ArrayRef<int64_t> shape =
      shaped_value.getType().cast<ShapedType>().getShape();
  SmallVector<Value, 2> lower, upper, step;
  for (auto dim : llvm::enumerate(shape)) {
    upper.push_back(
        GetStaticOrDynamicDim(loc, shaped_value, dim.index(), dim.value(), b));
    lower.push_back(zero);
    step.push_back(one);
  }
  return b->create<scf::ParallelOp>(loc, lower, upper, step);
}

// Converts `xla_lhlo.ReduceOp` into two scf::ParallelOp and a scf::ReduceOp.
// The outer `ParallelOp` refers to the parallel loops if there are
// any. The inner `ParallelOp` refers to the reduction loops and `ReduceOp`
// contains the reduction operator.
//
// Example:
//
//  "xla_lhlo.reduce"(%buffer, %init_buf, %result) ( {
//    ^bb0(%lhs: memref<f32>, %rhs: memref<f32>, %res: memref<f32>):
//      <LHLO ops>
//  } ) {dimensions = dense<[1]> : tensor<1xi64>}
//      : (memref<100x10x5xf32>, memref<f32>, memref<100x5xf32>) -> ()
//
// is roughly converted into:
//
//  %init = load %init_buf[] : memref<f32>
//  scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) {
//    %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) {
//      %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32>
//      scf.reduce(%elem_to_reduce) {
//        ^bb0(%elem: f32, %acc: f32):   // no predecessors
//          elem_buf = alloc() : memref<f32>
//          store %elem, elem_buf[] : memref<f32>
//          acc_buf = alloc() : memref<f32>
//          store %acc, acc_buf[] : memref<f32>
//          <LHLO_ops>
//          %acc_result = load acc_buf[] : memref<f32>
//          scf.reduce.return %acc_result : f32
//      } : f32
//      scf.yield
//    } : f32
//    scf.yield
//  }
class ReduceOpConverter : public OpConversionPattern<xla_lhlo::ReduceOp> {
 public:
  using OpConversionPattern<xla_lhlo::ReduceOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::ReduceOp xla_reduce_op, ArrayRef<Value> /*args*/,
      ConversionPatternRewriter& rewriter) const final {
    // TODO(b/137624192) Implement variadic reduce.
    if (xla_reduce_op.out().size() != 1) return failure();

    scf::ReduceOp reduce_op =
        CreateReduceOpInNestedParallelLoops(xla_reduce_op, &rewriter);
    ConvertToReductionOperator(xla_reduce_op.getLoc(), reduce_op,
                               &xla_reduce_op.body().front(), &rewriter);
    rewriter.replaceOp(xla_reduce_op, llvm::None);
    return success();
  }

 private:
  // Creates nested `scf.parallel` ops with `scf.reduce`. The outer ParallelOp
  // refers to the parallel dimensions of `xla_reduce_op` if any and the inner
  // ParallelOp refers to the reduction dimensions. The scf.reduce op is
  // returned.
  //
  // If the reduction argument is a memref<100x10x5xf32> and the
  // reduction is performed along dimension 1 then this method will generate
  //
  //  %init = load %init_buf[] : memref<f32>
  //  scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) {
  //    %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) {
  //      %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32>
  //      scf.reduce(%elem_to_reduce) {
  //        <THE BLOCK PTR TO BE RETURNED>
  //      } : f32
  //      scf.yield
  //    } : f32
  //    scf.yield
  //  }
  scf::ReduceOp CreateReduceOpInNestedParallelLoops(
      xla_lhlo::ReduceOp xla_reduce_op,
      ConversionPatternRewriter* rewriter) const {
    auto loc = xla_reduce_op.getLoc();
    DenseSet<int> reducing_dims;
    for (const auto& rdim : xla_reduce_op.dimensions().getIntValues()) {
      reducing_dims.insert(rdim.getSExtValue());
    }

    Value operand = *xla_reduce_op.operands().begin();
    Value out = *xla_reduce_op.out().begin();
    SmallVector<Value, 2> parallel_lower, parallel_upper, parallel_step;
    SmallVector<Value, 2> reduce_lower, reduce_upper, reduce_step;
    auto operand_shape = operand.getType().cast<MemRefType>().getShape();
    for (auto dim : llvm::enumerate(operand_shape)) {
      const bool is_reducing_dim = reducing_dims.count(dim.index());

      Value ub = GetStaticOrDynamicDim(loc, operand, dim.index(), dim.value(),
                                       rewriter);
      Value lb = rewriter->create<ConstantIndexOp>(loc, 0);
      Value step = rewriter->create<ConstantIndexOp>(loc, 1);
      (is_reducing_dim ? reduce_lower : parallel_lower).push_back(lb);
      (is_reducing_dim ? reduce_upper : parallel_upper).push_back(ub);
      (is_reducing_dim ? reduce_step : parallel_step).push_back(step);
    }
    // Load initial value from memref<element_type>.
    SmallVector<Value, 1> init_value = {
        rewriter->create<LoadOp>(loc, *xla_reduce_op.init_values().begin())};
    // Outer ParallelOp is not needed if it is a reduction across all dims.
    scf::ParallelOp outer;
    if (!parallel_lower.empty()) {
      outer = rewriter->create<scf::ParallelOp>(loc, parallel_lower,
                                                parallel_upper, parallel_step);
      rewriter->setInsertionPointToStart(outer.getBody());
    }
    scf::ParallelOp inner = rewriter->create<scf::ParallelOp>(
        loc, reduce_lower, reduce_upper, reduce_step, ValueRange(init_value));
    Value reduction_result = *inner.getResults().begin();

    SmallVector<Value, 1> out_indices;
    if (outer != nullptr) {
      out_indices.reserve(outer.getNumLoops());
      for (Value iv : outer.getInductionVars()) {
        out_indices.push_back(iv);
      }
    } else {
      out_indices.push_back(rewriter->create<ConstantIndexOp>(loc, 0));
    }

    rewriter->create<StoreOp>(loc, reduction_result, out, out_indices);

    // Load the element to reduce.
    SmallVector<Value, 2> indices;
    indices.reserve(operand_shape.size());

    if (outer) {
      auto inner_ivs_it = inner.getInductionVars().begin();
      auto outer_ivs_it = outer.getInductionVars().begin();
      for (unsigned i = 0, e = operand_shape.size(); i < e; ++i) {
        indices.push_back(reducing_dims.count(i) ? *inner_ivs_it++
                                                 : *outer_ivs_it++);
      }
    } else {
      indices = inner.getInductionVars();
    }

    rewriter->setInsertionPointToStart(inner.getBody());
    Value elem = rewriter->create<mlir::LoadOp>(
        loc, *xla_reduce_op.operands().begin(), indices);
    return rewriter->create<scf::ReduceOp>(loc, elem);
  }
};

// Pseudocode:
// for each index O in output
//   accumulator = neutral_value
//   in_bounds = true
//   for each index W in window
//     for each dimension i from 0 to rank - 1
//       index = O[i] * stride[i] + W[i] - pad_low[i]
//       in_bounds = in_bounds && (index `ult` shape[i])
//       I[i] = index
//     if (in_bounds)
//       value = input[I]
//     else
//       value = neutral_value
//     accumulator = reduction_operator(output[O], value)
// output[O] = accumulator
//
// Converts `xla_lhlo.ReduceWindowOp` into two scf::ParallelOp and a
// scf::ReduceOp.
// The outer `ParallelOp` refers to the parallel loops that traverse the
// output buffer. The inner `ParallelOp` refers to the reduction loops that
// traverse the reduction windows and `ReduceOp` contains the reduction
// operator.
//
// Example:
//
// func @reduce_window(%arg: memref<112x112xf32>,
//                     %init: memref<f32>,
//                     %result: memref<56x56xf32>) {
//   "xla_lhlo.reduce_window"(%arg, %init, %result) ( {
//     ^bb0(%lhs: memref<f32>, %rhs: memref<f32>, %res: memref<f32>):
//       "xla_lhlo.maximum"(%lhs, %rhs, %res)
//           : (memref<f32>, memref<f32>, memref<f32>) -> ()
//       "xla_lhlo.terminator"() : () -> ()
//   }) {
//     padding = dense<[[0, 1], [0, 1]]> : tensor<2x2xi64>,
//     window_dimensions = dense<[3, 3]> : tensor<2xi64>,
//     window_strides = dense<[2, 2]> : tensor<2xi64>
//   } : (memref<112x112xf32>, memref<f32>, memref<56x56xf32>) -> ()
//   return
// }
//
// is roughly converted into:
//
// %neutral_elem = load %init_buf[] : memref<f32>
// scf.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) {
//   %result = scf.parallel (%iw, %jw) = (%c0, %c0)
//                 to (%c3, %c3) step (%c1, %c1) init (%neutral_elem) -> f32 {
//     %in_bounds = <COMPUTE IF INDEX IS IN OPERAND'S pad>
//     %elem = load %operand[%computed_i, %computed_j]
//     %elem_or_neutral = select %in_bounds, %elem, %neutral_elem : f32
//     scf.reduce(%elem_or_neutral) : f32 {
//       ^bb0(%arg7: f32, %arg8: f32):
//         <LHLO ops>
//     }
//     scf.yield
//   }
//   store %result, %output_buffer[%i, %j] : memref<56x56xf32>
//   scf.yield
// }
// return
// }
class ReduceWindowOpConverter
    : public OpConversionPattern<xla_lhlo::ReduceWindowOp> {
 public:
  using OpConversionPattern<xla_lhlo::ReduceWindowOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::ReduceWindowOp xla_reduce_window_op, ArrayRef<Value> /*args*/,
      ConversionPatternRewriter& rewriter) const final {
    scf::ParallelOp output_loop, window_loop;
    std::tie(output_loop, window_loop) =
        CreateParallelLoopsToTraverseOutputAndWindow(xla_reduce_window_op,
                                                     &rewriter);

    scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops(
        xla_reduce_window_op, output_loop, window_loop, &rewriter);

    ConvertToReductionOperator(xla_reduce_window_op.getLoc(), reduce_op,
                               &xla_reduce_window_op.body().front(), &rewriter);
    rewriter.replaceOp(xla_reduce_window_op, llvm::None);
    return success();
  }

 private:
  std::pair<scf::ParallelOp, scf::ParallelOp>
  CreateParallelLoopsToTraverseOutputAndWindow(
      xla_lhlo::ReduceWindowOp xla_reduce_window_op,
      ConversionPatternRewriter* rewriter) const {
    auto loc = xla_reduce_window_op.getLoc();
    Value init_value =
        rewriter->create<LoadOp>(loc, xla_reduce_window_op.init_value());

    Value zero = rewriter->create<ConstantIndexOp>(loc, 0);
    Value one = rewriter->create<ConstantIndexOp>(loc, 1);

    // Create an outer parallel loop that spans the output of ReduceWindowOp.
    Value xla_output = xla_reduce_window_op.out();
    auto output_loop = MakeLoopOverShape(loc, xla_output, rewriter);

    // Create a nested loop that traverses the window.
    SmallVector<Value, 2> window_lower, window_upper, window_step;
    rewriter->setInsertionPointToStart(output_loop.getBody());
    for (const auto& window_dim : xla_reduce_window_op.window_dimensions()) {
      window_step.push_back(one);
      window_lower.push_back(zero);
      window_upper.push_back(
          rewriter->create<ConstantIndexOp>(loc, window_dim.getSExtValue()));
    }
    auto window_loop = rewriter->create<scf::ParallelOp>(
        loc, window_lower, window_upper, window_step, ValueRange(init_value));

    Value reduction_result = *window_loop.getResults().begin();
    auto output_ivs = output_loop.getInductionVars();
    rewriter->create<StoreOp>(loc, reduction_result, xla_output, output_ivs);
    return std::make_pair(output_loop, window_loop);
  }

  scf::ReduceOp CreateReduceOpInNestedParallelLoops(
      xla_lhlo::ReduceWindowOp xla_reduce_window_op,
      scf::ParallelOp output_loop, scf::ParallelOp window_loop,
      ConversionPatternRewriter* rewriter) const {
    rewriter->setInsertionPointToStart(window_loop.getBody());
    auto loc = xla_reduce_window_op.getLoc();

    if (xla_reduce_window_op.base_dilations().hasValue() ||
        xla_reduce_window_op.window_dilations().hasValue()) {
      xla_reduce_window_op.emitRemark(
          "Lowering to parallel loops does not support `base_dilations` or "
          "`window_dilations` attributes yet. The attributes will be ignored.");
    }

    Value xla_operand = xla_reduce_window_op.operand();
    auto xla_operand_type = xla_operand.getType().cast<MemRefType>();

    // Compute ivs in 'arg' buffer and whether these ivs are in pad area or not.
    MappedIvs mapped_ivs = MapWindowIvsToInput(
        xla_reduce_window_op, output_loop.getInductionVars(),
        window_loop.getInductionVars(), rewriter);

    auto elem_or_init = rewriter->create<scf::IfOp>(
        loc, xla_operand_type.getElementType(), mapped_ivs.in_bounds,
        /*withElseRegion=*/true);

    OpBuilder then_builder = elem_or_init.getThenBodyBuilder();
    Value elem = then_builder.create<mlir::LoadOp>(
        loc, xla_reduce_window_op.operand(), mapped_ivs.ivs);
    then_builder.create<scf::YieldOp>(loc, elem);

    OpBuilder else_builder = elem_or_init.getElseBodyBuilder();
    else_builder.create<scf::YieldOp>(loc, *window_loop.initVals().begin());

    return rewriter->create<scf::ReduceOp>(loc,
                                           *elem_or_init.results().begin());
  }
};

// See the operation semantics in
// https://www.tensorflow.org/xla/operation_semantics#selectandscatter
//
// Pseudocode:
// scf.parallel(coordinates O in the output):
//   output[O] = init
// scf.parallel(coordinates S in the source):
//   selected_ivs = 0
//   selected_val = 0
//   initialized_flag = false
//   scf.for (first dim W_1 in the window)
//       iter_args (selected_ivs, selected_val, initialized_flag):
//     ...
//     scf.for (last dim W_N in the window):
//         iter_args (selected_ivs, selected_val, initialized_flag):
//       I = S * stride + W - pad_low
//       if I within bounds of operand:
//         if (initialized_flag):
//           pred = select(selected_val, operand(I))
//           if (pred)
//             selected_val = operand(I)
//             selected_ivs = I
//         else
//           selected_val = operand(I)
//           selected_ivs = I
//           initialized_flag = true
//   output(selected_ivs) = scatter(output(selected_ivs), source(S))
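//
// For example (a sketch; the exact attribute and type syntax is elided), a
// 2x2/stride-2 max-pool-style select-and-scatter looks like:
//
//   "xla_lhlo.select_and_scatter"(%operand, %source, %init, %out) ( {
//     ... select region: a "GE" comparison picks the window maximum ...
//   }, {
//     ... scatter region: addition accumulates into the selected index ...
//   }) {window_dimensions = dense<[2, 2]> : tensor<2xi64>,
//       window_strides = dense<[2, 2]> : tensor<2xi64>} : ...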
class SelectAndScatterOpConverter
    : public OpConversionPattern<xla_lhlo::SelectAndScatterOp> {
 public:
  using OpConversionPattern<xla_lhlo::SelectAndScatterOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::SelectAndScatterOp s_and_s_op, ArrayRef<Value> /*args*/,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = s_and_s_op.getLoc();
    InitializeOutput(s_and_s_op, &rewriter);
    scf::ParallelOp loop_over_src =
        MakeLoopOverShape(loc, s_and_s_op.source(), &rewriter);
    rewriter.setInsertionPointToStart(loop_over_src.getBody());

    // Compute indices of the selected element in the window.
    auto selected_ivs = SelectIvs(s_and_s_op, loop_over_src, &rewriter);

    // Load `source[selected_ivs]`.
    auto src_elem = rewriter.create<LoadOp>(loc, s_and_s_op.source(),
                                            loop_over_src.getInductionVars());

    // Compute `out[selected_ivs] = scatter(out[selected_ivs], src_elem)`.
    auto rmw = rewriter.create<GenericAtomicRMWOp>(loc, s_and_s_op.out(),
                                                   selected_ivs);
    OpBuilder rmw_builder = OpBuilder::atBlockEnd(rmw.getBody());
    auto acc_result =
        ApplySingleResultLhloCode(loc, {src_elem, rmw.getCurrentValue()},
                                  &s_and_s_op.scatter().front(), &rmw_builder);
    rmw_builder.create<AtomicYieldOp>(loc, acc_result);

    rewriter.replaceOp(s_and_s_op, llvm::None);
    return success();
  }

 private:
  void InitializeOutput(xla_lhlo::SelectAndScatterOp s_and_s_op,
                        OpBuilder* b) const {
    auto loc = s_and_s_op.getLoc();
    Value init_value = b->create<LoadOp>(loc, s_and_s_op.init_value());

    scf::ParallelOp loop_over_output =
        MakeLoopOverShape(loc, s_and_s_op.out(), b);
    OpBuilder::InsertionGuard guard(*b);
    b->setInsertionPointToStart(loop_over_output.getBody());
    b->create<StoreOp>(loc, init_value, s_and_s_op.out(),
                       loop_over_output.getInductionVars());
  }

  struct WindowLoops {
    SmallVector<Value, 2> selected_ivs;
    SmallVector<Value, 2> window_ivs;
    scf::ForOp inner_loop;
  };
  WindowLoops InsertWindowLoops(xla_lhlo::SelectAndScatterOp s_and_s_op,
                                scf::ParallelOp loop_over_src,
                                OpBuilder* b) const {
    auto loc = s_and_s_op.getLoc();
    Value zero = b->create<ConstantIndexOp>(loc, 0);
    Value one = b->create<ConstantIndexOp>(loc, 1);

    auto element_type =
        s_and_s_op.out().getType().cast<MemRefType>().getElementType();
    auto rank = loop_over_src.getNumLoops();

    // `iter_args` = [iv_1, ..., iv_N, selected_value, is_initialized]
    SmallVector<Value, 4> iter_args(rank, zero);
    iter_args.push_back(b->create<mlir::ConstantOp>(
        loc, element_type, b->getFloatAttr(element_type, 0)));
    iter_args.push_back(b->create<mlir::ConstantOp>(
        loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 0)));

    // Create a nested loop that traverses the window.
    OpBuilder::InsertPoint ip;
    WindowLoops result;
    for (const auto& window_dim :
         s_and_s_op.window_dimensions()->getIntValues()) {
      Value upper = b->create<ConstantIndexOp>(loc, window_dim.getSExtValue());
      result.inner_loop =
          b->create<scf::ForOp>(loc, zero, upper, one, iter_args);
      if (b->getInsertionBlock() == loop_over_src.getBody()) {
        ip = b->saveInsertionPoint();
        result.selected_ivs = result.inner_loop.getResults().take_front(rank);
      } else {
        b->create<scf::YieldOp>(loc, result.inner_loop.getResults());
      }
      b->setInsertionPointToStart(result.inner_loop.getBody());
      iter_args = ValueRange{result.inner_loop.getRegionIterArgs()};
      result.window_ivs.push_back(result.inner_loop.getInductionVar());
    }
    b->restoreInsertionPoint(ip);
    return result;
  }
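
  // The nest this builds for a 2-d window looks roughly like (a sketch; the
  // iter_args thread [ivs..., selected value, initialized flag] through every
  // level):
  //
  //   %outer:4 = scf.for %w0 = %c0 to %c3 step %c1
  //       iter_args(%i0 = %c0, %i1 = %c0, %val = %cst, %init = %false) {
  //     %inner:4 = scf.for %w1 = %c0 to %c3 step %c1 iter_args(...) {
  //       <select logic>
  //       scf.yield <updated iter_args>
  //     }
  //     scf.yield %inner#0, %inner#1, %inner#2, %inner#3
  //   }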

  // Adapter to store iteration arguments of sequential loops that perform
  // select in a window.
  class IterArgs {
   public:
    explicit IterArgs(ValueRange ivs_val_flag) : ivs_val_flag_(ivs_val_flag) {}
    IterArgs(ValueRange ivs, Value value, Value flag) {
      ivs_val_flag_ = ivs;
      ivs_val_flag_.push_back(value);
      ivs_val_flag_.push_back(flag);
    }

    ArrayRef<Value> to_vector() const { return ivs_val_flag_; }

    // Indices of the currently selected value.
    ArrayRef<Value> ivs() const { return to_vector().drop_back(2); }
    // Currently selected value w.r.t. select() function.
    Value value() const { return ivs_val_flag_.end()[-2]; }
    // i1 flag indicating whether value() and ivs() were initialized.
    Value is_init() const { return ivs_val_flag_.back(); }

   private:
    // Vector that stores iv_1, ..., iv_N, value, init.
    SmallVector<Value, 4> ivs_val_flag_;
  };

  SmallVector<Value, 2> SelectIvs(xla_lhlo::SelectAndScatterOp s_and_s_op,
                                  scf::ParallelOp loop_over_src,
                                  OpBuilder* b) const {
    auto loc = s_and_s_op.getLoc();

    WindowLoops window_loops = InsertWindowLoops(s_and_s_op, loop_over_src, b);
    auto inner_loop_b =
        OpBuilder::atBlockEnd(window_loops.inner_loop.getBody());

    // Compute ivs in 'arg' buffer and whether these ivs are in the pad area.
    MappedIvs mapped_ivs =
        MapWindowIvsToInput(s_and_s_op, loop_over_src.getInductionVars(),
                            window_loops.window_ivs, &inner_loop_b);

    IterArgs ivs_val_flag(window_loops.inner_loop.getRegionIterArgs());

    auto if_in_bounds = inner_loop_b.create<scf::IfOp>(
        loc, window_loops.inner_loop.getResultTypes(), mapped_ivs.in_bounds,
        /*withElseRegion=*/true);

    // Case when we are inside boundaries of 'arg' and not in the pad area.
    {
      OpBuilder in_bounds_then_b = if_in_bounds.getThenBodyBuilder();
      auto select_or_init_results = SelectOrInitialize(
          s_and_s_op, mapped_ivs.ivs, &ivs_val_flag, &in_bounds_then_b);
      in_bounds_then_b.create<scf::YieldOp>(loc, select_or_init_results);
    }

    // Case when we are in the pad.
    {
      OpBuilder in_bounds_else_b = if_in_bounds.getElseBodyBuilder();
      in_bounds_else_b.create<scf::YieldOp>(loc, ivs_val_flag.to_vector());
    }

    inner_loop_b.create<scf::YieldOp>(loc, if_in_bounds.getResults());
    return window_loops.selected_ivs;
  }

  SmallVector<Value, 4> SelectOrInitialize(
      xla_lhlo::SelectAndScatterOp s_and_s_op, ArrayRef<Value> operand_ivs,
      IterArgs* ivs_val_flag, OpBuilder* b) const {
    auto loc = s_and_s_op.getLoc();
    Value true_i1 = b->create<mlir::ConstantOp>(
        loc, b->getI1Type(), b->getIntegerAttr(b->getI1Type(), 1));

    TypeRange iter_arg_types{ivs_val_flag->to_vector()};
    Value operand_elem =
        b->create<LoadOp>(loc, s_and_s_op.operand(), operand_ivs);
    auto if_init =
        b->create<scf::IfOp>(loc, iter_arg_types, ivs_val_flag->is_init(),
                             /*withElseRegion=*/true);
    // Init == true, i.e. iter args are already initialized with a selected
    // element in boundaries of the operand. Select function has to be computed
    // here.
    {
      OpBuilder if_init_then_b = if_init.getThenBodyBuilder();

      auto& lhlo_select = s_and_s_op.select().front();
      Value pred =
          ApplySingleResultLhloCode(loc, {operand_elem, ivs_val_flag->value()},
                                    &lhlo_select, &if_init_then_b);

      auto if_pred = if_init_then_b.create<scf::IfOp>(loc, iter_arg_types, pred,
                                                      /*withElseRegion=*/true);

      // Pred == true, therefore pack newly selected ivs, val and init flag back
      // to iter_args and return.
      {
        OpBuilder if_pred_then_b = if_pred.getThenBodyBuilder();
        if_pred_then_b.create<scf::YieldOp>(
            loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector());
      }

      // Pred == false, therefore return old iter_args.
      {
        OpBuilder if_pred_else_b = if_pred.getElseBodyBuilder();
        if_pred_else_b.create<scf::YieldOp>(loc, ivs_val_flag->to_vector());
      }

      if_init_then_b.create<scf::YieldOp>(loc, if_pred.getResults());
    }
    // Init == false, i.e. only pad was visited before and this is the first
    // element in the boundaries of the operand.
    {
      OpBuilder if_init_else_b = if_init.getElseBodyBuilder();

      if_init_else_b.create<scf::YieldOp>(
          loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector());
    }
    return if_init.getResults();
  }
};

struct LhloLegalizeToParallelLoops
    : public PassWrapper<LhloLegalizeToParallelLoops, FunctionPass> {
  void runOnFunction() override {
    auto func = getFunction();

    OwningRewritePatternList patterns;
    // clang-format off
    patterns.insert<
        ReduceOpConverter,
        ReduceWindowOpConverter,
        SelectAndScatterOpConverter
      >(func.getContext());
    // clang-format on

    ConversionTarget target(getContext());
    target.addLegalDialect<linalg::LinalgDialect, StandardOpsDialect,
                           scf::SCFDialect, XlaLhloDialect>();
    target.addIllegalOp<xla_lhlo::ReduceOp, xla_lhlo::ReduceWindowOp,
                        xla_lhlo::SelectAndScatterOp>();

    if (failed(applyPartialConversion(func, target, patterns))) {
      signalPassFailure();
    }
  }
};

}  // namespace

std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToParallelLoopsPass() {
  return absl::make_unique<LhloLegalizeToParallelLoops>();
}

static PassRegistration<LhloLegalizeToParallelLoops> legalize_lhlo_pass(
    "lhlo-legalize-to-parallel-loops",
    "Legalize from LHLO dialect to parallel loops.");

}  // namespace xla_lhlo
}  // namespace mlir
@ -0,0 +1,79 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements passes to convert complex operations to equivalent real
// value operations. This does not include removing complex values from function
// argument or return types.

#include <cstddef>
#include <cstdint>
#include <iterator>
#include <numeric>

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/STLExtras.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/TypeUtilities.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Types.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/PassRegistry.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h"

using mlir::FunctionPass;
using mlir::OwningRewritePatternList;
using mlir::PassRegistration;
using mlir::PassWrapper;

namespace {
class LowerComplex : public PassWrapper<LowerComplex, FunctionPass> {
 public:
  explicit LowerComplex() : PassWrapper<LowerComplex, FunctionPass>() {}

  /// Performs the lowering to XLA dialect.
  void runOnFunction() override;
};
}  // end anonymous namespace

namespace mlir {
namespace xla {
namespace {

#include "third_party/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/generated_lower_complex.inc"

}  // end anonymous namespace

void PopulateComplexLoweringPatterns(MLIRContext* context,
                                     OwningRewritePatternList* patterns) {
  populateWithGenerated(context, patterns);
}
}  // end namespace xla
}  // end namespace mlir

// Lowers the complex operations that can be represented using other operations.
void LowerComplex::runOnFunction() {
  // Add lowering patterns to the list.
  OwningRewritePatternList patterns;
  mlir::xla::PopulateComplexLoweringPatterns(&getContext(), &patterns);

  applyPatternsAndFoldGreedily(getFunction(), patterns);
}

static PassRegistration<LowerComplex> pass(
    "test-xla-lower-complex",
    "Lower complex operations into non-complex operations");
@ -0,0 +1,109 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This is the legalization pattern that converts complex operations into
// equivalent real value operations.

include "third_party/llvm/llvm-project/mlir/include/mlir/IR/OpBase.td"
include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td"
include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td"

//===----------------------------------------------------------------------===//
// Binary op patterns.
//===----------------------------------------------------------------------===//

// Addition and subtraction are elementwise and can be distributed across the
// real and imaginary components.
foreach elementwiseOp = [HLO_AddOp, HLO_SubOp] in
  def : Pat<(elementwiseOp HLO_ComplexTensor:$lhs,
             HLO_ComplexTensor:$rhs),
            (HLO_ComplexOp
              (elementwiseOp (HLO_RealOp $lhs), (HLO_RealOp $rhs)),
              (elementwiseOp (HLO_ImagOp $lhs), (HLO_ImagOp $rhs)))>;

// Complex multiplication results in a cross product multiplication between the
// real and imaginary components such that:
//   result.real = lhs.real * rhs.real - lhs.imag * rhs.imag
//   result.imag = lhs.imag * rhs.real + lhs.real * rhs.imag
def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs,
           HLO_ComplexTensor:$rhs),
          (HLO_ComplexOp
            (HLO_SubOp
              (HLO_MulOp
                (HLO_RealOp:$lhs_real $lhs),
                (HLO_RealOp:$rhs_real $rhs)),
              (HLO_MulOp
                (HLO_ImagOp:$lhs_imag $lhs),
                (HLO_ImagOp:$rhs_imag $rhs))),
            (HLO_AddOp
              (HLO_MulOp $lhs_real, $rhs_imag),
              (HLO_MulOp $lhs_imag, $rhs_real)))>;
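
// A quick numeric check of the formulas above (illustration only):
// (1 + 2i) * (3 + 4i) gives real = 1*3 - 2*4 = -5 and imag = 2*3 + 1*4 = 10,
// i.e. -5 + 10i.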

// Multiplication between a complex and a real tensor can be distributed by
// applying the real multiplicand to both the real and imaginary components.
//
// Note that the source pattern is not legal according to the HLO dialect but
// instead handles intermediates generated by other patterns.
def : Pat<(HLO_MulOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs),
          (HLO_ComplexOp
            (HLO_MulOp (HLO_RealOp $lhs), $rhs),
            (HLO_MulOp (HLO_ImagOp $lhs), $rhs))>;

def : Pat<(HLO_MulOp HLO_IntOrFpTensor:$lhs, HLO_ComplexTensor:$rhs),
          (HLO_ComplexOp
            (HLO_MulOp $lhs, (HLO_RealOp $rhs)),
            (HLO_MulOp $lhs, (HLO_ImagOp $rhs)))>;

// Division is performed by normalizing the denominator by multiplying by the
// conjugate of the rhs.
//   numerator = lhs * conj(rhs)
//   denominator = rhs * conj(rhs)
def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_ComplexTensor:$rhs),
          (HLO_DivOp
            (HLO_MulOp:$num $lhs,
              (HLO_ComplexOp:$conj
                (HLO_RealOp $rhs),
                (HLO_NegOp (HLO_ImagOp $rhs)))),
            (HLO_RealOp:$den (HLO_MulOp $rhs, $conj)))>;
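
// Algebraically (illustration only): for lhs = a + bi and rhs = c + di,
//   lhs * conj(rhs) = (ac + bd) + (bc - ad)i
//   rhs * conj(rhs) = c^2 + d^2, which is real,
// so the division above reduces to a complex-by-real division that the
// pattern below then distributes over both components.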

def : Pat<(HLO_DivOp HLO_ComplexTensor:$lhs, HLO_IntOrFpTensor:$rhs),
          (HLO_ComplexOp
            (HLO_DivOp (HLO_RealOp $lhs), $rhs),
            (HLO_DivOp (HLO_ImagOp $lhs), $rhs))>;

// Absolute value is evaluated as:
//   result = sqrt(val.real * val.real + val.imag * val.imag)
def : Pat<(HLO_AbsOp HLO_ComplexTensor:$val),
          (HLO_ComplexOp
            (HLO_SqrtOp
              (HLO_AddOp
                (HLO_MulOp (HLO_RealOp:$real $val), $real),
                (HLO_MulOp (HLO_ImagOp:$imag $val), $imag))),
            (HLO_ConstOp (ConstantSplat<"0"> $real)))>;

// Exponential can be lowered to an exponential on the real component and a
// sum of sinusoids of the imaginary component, which equates to a normal
// exponential operator multiplied by Euler's formula.
//
//   Exp(a + ib) = Exp(a) * Exp(ib) = Exp(a) * (Cos(b) + iSin(b))
def : Pat<(HLO_ExpOp HLO_ComplexTensor:$val),
          (HLO_MulOp
            (HLO_ExpOp (HLO_RealOp $val)),
            (HLO_ComplexOp
              (HLO_CosOp (HLO_ImagOp:$imag $val)),
              (HLO_SinOp $imag)))>;
@ -0,0 +1,194 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering XLA general dot to a regular dot.

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/STLExtras.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/StringSwitch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Location.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/TypeUtilities.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

using mlir::DenseIntElementsAttr;
using mlir::ElementsAttr;
using mlir::failure;
using mlir::FunctionPass;
using mlir::LogicalResult;
using mlir::MLIRContext;
using mlir::OpRewritePattern;
using mlir::OwningRewritePatternList;
using mlir::PassRegistration;
using mlir::PassWrapper;
using mlir::PatternRewriter;
using mlir::RankedTensorType;
using mlir::success;
using mlir::Value;

namespace {

Value TransposeReshape(Value arg, mlir::Location loc,
                       llvm::ArrayRef<int64_t> left_dims,
                       llvm::ArrayRef<int64_t> right_dims,
                       llvm::ArrayRef<int64_t> arg_shape,
                       PatternRewriter *rewriter) {
  auto element_type = mlir::getElementTypeOrSelf(arg.getType());

  int64_t left_size = 1;
  for (auto dim : left_dims) {
    left_size *= arg_shape[dim];
  }

  int64_t right_size = 1;
  for (auto dim : right_dims) {
    right_size *= arg_shape[dim];
  }

  // Generate the transpose permutation attribute.
  llvm::SmallVector<int64_t, 5> transpose_permutation(left_dims.begin(),
                                                      left_dims.end());
  transpose_permutation.append(right_dims.begin(), right_dims.end());

  mlir::TensorType transpose_permutation_type = RankedTensorType::get(
      {static_cast<int64_t>(transpose_permutation.size())},
      rewriter->getIntegerType(64));

  auto transpose_permutation_attr =
      DenseIntElementsAttr::get(transpose_permutation_type,
                                llvm::makeArrayRef(transpose_permutation))
          .cast<DenseIntElementsAttr>();

  // Compute the resulting shape.
  llvm::SmallVector<int64_t, 5> transposed_shape;
  for (auto val : transpose_permutation) {
    transposed_shape.push_back(arg_shape[val]);
  }
  auto transpose_type = RankedTensorType::get(transposed_shape, element_type);
  auto transpose_result = rewriter->create<mlir::xla_hlo::TransposeOp>(
      loc, transpose_type, arg, transpose_permutation_attr);

  // Return the final result.
  auto reshaped_type =
      RankedTensorType::get({left_size, right_size}, element_type);
  return rewriter->create<mlir::xla_hlo::ReshapeOp>(loc, reshaped_type,
                                                    transpose_result);
}
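
// For example (illustration only): with arg_shape = [2, 3, 4],
// left_dims = [1], and right_dims = [0, 2], the permutation is [1, 0, 2],
// the transpose yields a tensor of shape [3, 2, 4], and the reshape collapses
// it to [3, 8].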

Value ProcessDotArg(Value arg, mlir::Location loc,
                    ElementsAttr contract_dims_attr, bool outer_dims_first,
                    PatternRewriter *rewriter) {
  auto shape = arg.getType().cast<mlir::ShapedType>().getShape();

  llvm::SmallVector<bool, 5> is_outer_dim;
  is_outer_dim.resize(shape.size(), true);

  // Compute the contract dimension ordering.
  llvm::SmallVector<int64_t, 5> contract_dims;
  for (auto dim : contract_dims_attr.getValues<int64_t>()) {
    contract_dims.push_back(dim);
    is_outer_dim[dim] = false;
  }

  // Compute the outer dimension orderings.
  llvm::SmallVector<int64_t, 5> outer_dims;
  for (auto it : llvm::enumerate(is_outer_dim)) {
    if (it.value()) {
      outer_dims.push_back(it.index());
    }
  }

  if (outer_dims_first) {
    return TransposeReshape(arg, loc, outer_dims, contract_dims, shape,
                            rewriter);
  }

  return TransposeReshape(arg, loc, contract_dims, outer_dims, shape, rewriter);
}

struct GeneralDotConvert
    : public OpRewritePattern<mlir::xla_hlo::DotGeneralOp> {
  // Attempts to lower a General Dot operator to a standard Dot operator.
  // General dots include batching dimensions and can have collapsing
  // dimensions along any axis. Inserting correctly arranged transpose and
  // reshape operators organizes the tensors and allows the General Dot to be
  // replaced with the standard Dot operator.
  //
  // Note: This requires an empty list of batch dimensions.

  explicit GeneralDotConvert(MLIRContext *context)
      : OpRewritePattern(context) {}

  LogicalResult matchAndRewrite(mlir::xla_hlo::DotGeneralOp op,
                                PatternRewriter &rewriter) const override {
    auto dot_element_type = mlir::getElementTypeOrSelf(op);

    auto dot_numbers = op.dot_dimension_numbers();
    if (dot_numbers.lhs_batching_dimensions().getNumElements() != 0 ||
        dot_numbers.rhs_batching_dimensions().getNumElements() != 0) {
      return failure();
    }

    auto lhs = ProcessDotArg(op.lhs(), op.getLoc(),
                             dot_numbers.lhs_contracting_dimensions(),
                             /*outer_dims_first=*/true, &rewriter);

    auto rhs = ProcessDotArg(op.rhs(), op.getLoc(),
                             dot_numbers.rhs_contracting_dimensions(),
                             /*outer_dims_first=*/false, &rewriter);

    // Dot resulting shape.
    auto lhs_shape = lhs.getType().cast<mlir::ShapedType>().getShape();
    auto rhs_shape = rhs.getType().cast<mlir::ShapedType>().getShape();
    auto new_dot_type =
        RankedTensorType::get({lhs_shape[0], rhs_shape[1]}, dot_element_type);

    auto new_dot_op = rewriter.create<mlir::xla_hlo::DotOp>(
        op.getLoc(), new_dot_type, lhs, rhs, *(op.precision_config()));

    rewriter.replaceOpWithNewOp<mlir::xla_hlo::ReshapeOp>(op, op.getType(),
                                                          new_dot_op);
    return success();
  }
};
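
// For illustration (a sketch, not a verified test case): a dot_general with
// lhs : tensor<5x6xf32> (lhs_contracting_dimensions = [0]) and
// rhs : tensor<5x7xf32> (rhs_contracting_dimensions = [0]) is processed into
// a transposed/reshaped lhs of tensor<6x5xf32> and an rhs of tensor<5x7xf32>,
// an xla_hlo.dot producing tensor<6x7xf32>, and a final reshape back to the
// original result type.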

struct LegalizeGeneralDot
    : public PassWrapper<LegalizeGeneralDot, FunctionPass> {
  /// Lower all general dots that can be represented as a non-batched matmul.
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    mlir::xla_hlo::PopulateGeneralDotOpLoweringPatterns(&patterns,
                                                        &getContext());
    applyPatternsAndFoldGreedily(getFunction(), patterns);
  }
};

}  // namespace

void mlir::xla_hlo::PopulateGeneralDotOpLoweringPatterns(
    OwningRewritePatternList *patterns, MLIRContext *ctx) {
  patterns->insert<GeneralDotConvert>(ctx);
}

static PassRegistration<LegalizeGeneralDot> legalize_pass(
    "test-xla-lower-general-dot",
    "Tests lowering general dot to a non-batched dot when possible");
@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <numeric>

#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"

namespace mlir {
namespace xla_hlo {

namespace {

// Converts ClampOp with broadcast semantics. ClampOp requires "all three
// arrays must be the same shape. Alternatively, as a restricted form of
// broadcasting, min and/or max can be a scalar of type T."
struct ClampWithBroadcastConvert : public OpRewritePattern<ClampOp> {
  explicit ClampWithBroadcastConvert(MLIRContext *context)
      : OpRewritePattern<ClampOp>(context) {}

  LogicalResult matchAndRewrite(ClampOp op,
                                PatternRewriter &rewriter) const override {
    auto operand_type = op.operand().getType().dyn_cast<RankedTensorType>();
    auto max_type = op.max().getType().dyn_cast<RankedTensorType>();
    auto min_type = op.min().getType().dyn_cast<RankedTensorType>();
    // Unranked types are not supported.
    if (!operand_type || !max_type || !min_type) return failure();
    // Does not support operands with dynamic dimensions for now.
    if (!operand_type.hasStaticShape()) return failure();

    ArrayRef<int64_t> operand_shape = operand_type.getShape();

    Value max_value = op.max();
    if (max_type != operand_type) {
      assert(max_type.getRank() == 0);
      max_value = rewriter.createOrFold<BroadcastOp>(
          op.getLoc(), operand_type, max_value,
          rewriter.getI64TensorAttr(operand_shape));
    }

    Value min_value = op.min();
    if (min_type != operand_type) {
      assert(min_type.getRank() == 0);
      min_value = rewriter.createOrFold<BroadcastOp>(
          op.getLoc(), operand_type, min_value,
          rewriter.getI64TensorAttr(operand_shape));
    }

    rewriter.replaceOpWithNewOp<ClampOp>(op, op.getType(), min_value,
                                         op.operand(), max_value);
    return success();
  }
};
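
// A sketch of the rewrite (value names invented for illustration):
//
//   %0 = "xla_hlo.clamp"(%min_scalar, %x, %max_scalar)
//       : (tensor<f32>, tensor<4xf32>, tensor<f32>) -> tensor<4xf32>
//
// becomes a clamp over explicit broadcasts of the scalar bounds:
//
//   %min = "xla_hlo.broadcast"(%min_scalar)
//       {broadcast_sizes = dense<4> : tensor<1xi64>}
//       : (tensor<f32>) -> tensor<4xf32>
//   %max = "xla_hlo.broadcast"(%max_scalar) { ... same attribute ... }
//   %0 = "xla_hlo.clamp"(%min, %x, %max)
//       : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>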

}  // namespace

void SetupMaterializeBroadcastsLegality(MLIRContext *context,
                                        ConversionTarget *conversionTarget) {
  conversionTarget->addDynamicallyLegalOp<ClampOp>([](ClampOp op) {
    return op.max().getType() == op.operand().getType() &&
           op.min().getType() == op.operand().getType();
  });
}

void PopulateMaterializeBroadcastsPatterns(MLIRContext *context,
                                           OwningRewritePatternList *patterns) {
  // ClampOp. This op has a special case where it accepts either same-shaped
  // inputs or scalars (a restricted form of broadcasting). This makes the
  // broadcast explicit.
  patterns->insert<ClampWithBroadcastConvert>(context);
}

}  // namespace xla_hlo
}  // namespace mlir
@ -0,0 +1,58 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_hlo {

namespace {

struct TestMaterializeBroadcastsPass
    : public PassWrapper<TestMaterializeBroadcastsPass, FunctionPass> {
  void runOnFunction() override {
    ConversionTarget conversionTarget(getContext());
    OwningRewritePatternList conversionPatterns;

    // Consider the xla_hlo dialect legal for tests.
    conversionTarget.addLegalDialect<XlaHloDialect>();
    // The conversion uses helpers from the Standard dialect.
    conversionTarget.addLegalDialect<mlir::StandardOpsDialect>();

    SetupMaterializeBroadcastsLegality(&getContext(), &conversionTarget);
    PopulateMaterializeBroadcastsPatterns(&getContext(), &conversionPatterns);

    if (failed(applyPartialConversion(getFunction(), conversionTarget,
                                      conversionPatterns))) {
      return signalPassFailure();
    }
  }
};

}  // namespace

}  // namespace xla_hlo
}  // namespace mlir

static mlir::PassRegistration<mlir::xla_hlo::TestMaterializeBroadcastsPass>
    pass("test-xla-materialize-broadcasts",
         "Test pass for materializing 'broadcast_dimensions' attributes");
@ -0,0 +1,85 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/DenseMap.h"
#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/Casting.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/PassManager.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Support/LLVM.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/RegionUtils.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"

namespace mlir {
namespace xla_hlo {

namespace {

// A pass that sinks constants implicitly captured in control flow regions.
// This is necessary to export to XLA.
class SinkConstantsToControlFlow
    : public mlir::PassWrapper<SinkConstantsToControlFlow, FunctionPass> {
  void runOnFunction() override {
    getFunction().walk([](Operation* op) {
      if (auto while_op = llvm::dyn_cast<WhileOp>(op)) {
        SinkToRegion(&while_op.body());
        SinkToRegion(&while_op.cond());
      } else if (auto if_op = llvm::dyn_cast<IfOp>(op)) {
        SinkToRegion(&if_op.true_branch());
        SinkToRegion(&if_op.false_branch());
      }
    });
  }

 private:
  // Performs constant sinking into a region.
  static void SinkToRegion(Region* region) {
    llvm::DenseMap<Value, ConstOp> sunk_constant;
    visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) {
      Value constant = use->get();
      auto const_op = dyn_cast_or_null<ConstOp>(constant.getDefiningOp());
      if (!const_op) return;
      auto map_entry = sunk_constant.try_emplace(constant, nullptr);
      if (!map_entry.second) {
        // This constant has already been cloned into the region, reuse it.
        use->set(map_entry.first->getSecond().getResult());
        if (constant.use_empty()) const_op.erase();
        return;
      }
      if (constant.hasOneUse()) {
        const_op.getOperation()->moveBefore(&region->front().front());
        return;
      }
      map_entry.first->getSecond() = const_op.clone();
      region->front().getOperations().insert(region->front().begin(),
                                             map_entry.first->getSecond());
      use->set(map_entry.first->getSecond().getResult());
    });
  }
};
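
// For illustration (a sketch): given a constant used inside a while region,
//
//   %c = xla_hlo.constant dense<1> : tensor<i32>
//   "xla_hlo.while"(%arg) ( { ... uses %c ... }, { ... }) : ...
//
// the pass clones (or moves) %c to the top of each region that uses it, so
// the regions no longer capture values defined above them.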

static mlir::PassRegistration<SinkConstantsToControlFlow> pass(
    "xla-hlo-sink-constants-to-control-flow",
    "Sink constants implicitly captured in control flow regions. This is "
    "necessary to export to XLA.");

}  // anonymous namespace

std::unique_ptr<OperationPass<FuncOp>> createSinkConstantsToControlFlowPass() {
  return std::make_unique<SinkConstantsToControlFlow>();
}

}  // namespace xla_hlo
}  // namespace mlir
@ -0,0 +1,100 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Identifier.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/OperationSupport.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Interfaces/InferTypeOpInterface.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"

namespace mlir {
namespace xla {
namespace {

struct InferReturnTypeComponentsPattern : public RewritePattern {
  InferReturnTypeComponentsPattern(MLIRContext *context)
      : RewritePattern("xla_test.get_return_type_components", 1, context) {}
  LogicalResult matchAndRewrite(Operation *op,
                                PatternRewriter &rewriter) const override {
    if (op->getNumOperands() != 1) return failure();
    auto defining_op = op->getOperand(0).getDefiningOp();
    auto defining_op_int =
        llvm::dyn_cast_or_null<InferShapedTypeOpInterface>(defining_op);
    if (!defining_op_int) return failure();
    SmallVector<ShapedTypeComponents, 4> components;
    if (failed(defining_op_int.inferReturnTypeComponents(
            op->getContext(), op->getLoc(), defining_op->getOperands(),
            defining_op->getAttrDictionary(), defining_op->getRegions(),
            components))) {
      return failure();
    }

    // Replace the op with another pass-through op with attributes added.
    OperationState state(op->getLoc(), "xla_test.return_type_components",
                         op->getOperands(), op->getResultTypes(),
                         op->getAttrs());
    auto new_op = rewriter.createOperation(state);
    for (auto it : llvm::enumerate(components)) {
      if (it.value().hasRank()) {
        new_op->setAttr((StringRef("dims") + Twine(it.index())).str(),
                        rewriter.getI64ArrayAttr(it.value().getDims()));
      }
      if (it.value().getElementType()) {
        new_op->setAttr((Twine("element_type") + Twine(it.index())).str(),
                        TypeAttr::get(it.value().getElementType()));
      }
    }
    rewriter.replaceOp(op, {new_op->getResults()});
    return success();
  }
};
|
||||||
|
|
||||||
|
struct ReifyReturnTypeShapesPattern : public RewritePattern {
|
||||||
|
ReifyReturnTypeShapesPattern(MLIRContext *context)
|
||||||
|
: RewritePattern("xla_test.reify_return_type_shapes", 1, context) {}
|
||||||
|
LogicalResult matchAndRewrite(Operation *op,
|
||||||
|
PatternRewriter &rewriter) const override {
|
||||||
|
if (op->getNumOperands() != 1) return failure();
|
||||||
|
auto defining_op = llvm::dyn_cast_or_null<InferShapedTypeOpInterface>(
|
||||||
|
op->getOperand(0).getDefiningOp());
|
||||||
|
if (!defining_op) return failure();
|
||||||
|
SmallVector<Value, 4> return_shapes;
|
||||||
|
if (failed(defining_op.reifyReturnTypeShapes(rewriter, return_shapes))) {
|
||||||
|
return failure();
|
||||||
|
}
|
||||||
|
rewriter.replaceOp(op, return_shapes);
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TestInferShapedTypeMethodsPass
|
||||||
|
: public PassWrapper<TestInferShapedTypeMethodsPass, FunctionPass> {
|
||||||
|
void runOnFunction() override {
|
||||||
|
OwningRewritePatternList patterns;
|
||||||
|
patterns.insert<ReifyReturnTypeShapesPattern>(&getContext());
|
||||||
|
patterns.insert<InferReturnTypeComponentsPattern>(&getContext());
|
||||||
|
applyPatternsAndFoldGreedily(getFunction(), patterns);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace xla
|
||||||
|
} // namespace mlir
|
||||||
|
|
||||||
|
static mlir::PassRegistration<mlir::xla::TestInferShapedTypeMethodsPass> pass(
|
||||||
|
"test-xla-infer-shaped-type-methods",
|
||||||
|
"Uses test ops to invoke InferShapedTypeOpInterface methods");
|
|
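
// A hedged sketch of the rewrite performed above (the producer op name and
// shapes are invented for illustration; the test op names and the `dims<i>` /
// `element_type<i>` attribute names follow the pattern code above): given a
// producer implementing InferShapedTypeOpInterface,
//   %0 = "some.op"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32>
//   %1 = "xla_test.get_return_type_components"(%0) ...
// is rewritten to a pass-through op carrying the inferred components:
//   %1 = "xla_test.return_type_components"(%0)
//        {dims0 = [2, 3], element_type0 = f32} ...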
@ -0,0 +1,184 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/SmallVector.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Builders.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Types.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"

namespace mlir {
namespace xla_hlo {

namespace {

// Broadcasts the 1D value tensor 'value_1d' to the shape of 'result_type'. If
// 'shape_value' is initialized, creates a dynamic broadcast, otherwise creates
// a static broadcast.
Value BroadcastToFeatureDim(Location loc, RankedTensorType result_type,
                            Value value_1d, Value shape_value,
                            int64_t feature_dim,
                            PatternRewriter& rewriter) {  // NOLINT
  Builder b(rewriter.getContext());
  auto dims_type = RankedTensorType::get({1}, b.getIntegerType(64));
  auto dims = DenseIntElementsAttr::get(dims_type, {feature_dim});
  if (shape_value) {
    return rewriter.createOrFold<xla_hlo::DynamicBroadcastInDimOp>(
        loc, result_type, value_1d, shape_value, dims);
  }
  assert(result_type.hasStaticShape());
  return rewriter.create<xla_hlo::BroadcastInDimOp>(loc, result_type, value_1d,
                                                    dims);
}

// Calculates the shape value of 'operand', assuming it has a dynamic shape
// with static rank.
Value CalculateShapeValue(Location loc, Value operand,
                          PatternRewriter& rewriter) {  // NOLINT
  RankedTensorType result_type = operand.getType().dyn_cast<RankedTensorType>();
  llvm::SmallVector<Value, 4> shape_values;
  int64_t rank = result_type.getRank();
  shape_values.reserve(rank);
  for (int64_t i = 0; i < rank; ++i) {
    shape_values.push_back(rewriter.create<mlir::DimOp>(loc, operand, i));
  }
  return rewriter.create<TensorFromElementsOp>(loc, shape_values);
}

Value MaterializeEpsilon(Operation* op, FloatAttr epsilon_attr,
                         FloatType fp_type, Value variance,
                         RankedTensorType broadcast_to_type,
                         PatternRewriter& rewriter) {  // NOLINT
  Builder b(rewriter.getContext());
  if (epsilon_attr.getType() != fp_type) {
    // Need to convert.
    bool loses_info;
    APFloat epsilon_float = epsilon_attr.getValue();
    auto status = epsilon_float.convert(
        fp_type.getFloatSemantics(), APFloat::rmNearestTiesToEven, &loses_info);
    if ((status & (~APFloat::opInexact)) != APFloat::opOK) {
      op->emitWarning() << "Could not convert batch_norm epsilon to target fp "
                           "type: opStatus = "
                        << static_cast<int>(status);
      return nullptr;
    }
    if (loses_info) {
      op->emitWarning("Conversion of epsilon loses precision");
    }
    epsilon_attr = b.getFloatAttr(fp_type, epsilon_float);
  }

  auto scalar_type = RankedTensorType::get({}, fp_type);
  auto epsilon_tensor_attr =
      DenseElementsAttr::get(scalar_type, {epsilon_attr.cast<Attribute>()});
  Value epsilon =
      rewriter.create<xla_hlo::ConstOp>(op->getLoc(), epsilon_tensor_attr);
  auto dims_type = RankedTensorType::get({0}, b.getIntegerType(64));
  auto dims = DenseIntElementsAttr::get(dims_type, SmallVector<int64_t, 1>{});
  if (broadcast_to_type.hasStaticShape()) {
    return rewriter.create<xla_hlo::BroadcastInDimOp>(
        op->getLoc(), broadcast_to_type, epsilon, /*broadcast_dims=*/dims);
  }
  Value shape_value = CalculateShapeValue(op->getLoc(), variance, rewriter);
  return rewriter.createOrFold<xla_hlo::DynamicBroadcastInDimOp>(
      op->getLoc(), broadcast_to_type, epsilon, shape_value,
      /*broadcast_dims=*/dims);
}

class UnfuseBatchNormInferencePattern
    : public OpRewritePattern<xla_hlo::BatchNormInferenceOp> {
 public:
  using OpRewritePattern<xla_hlo::BatchNormInferenceOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(xla_hlo::BatchNormInferenceOp bn_op,
                                PatternRewriter& rewriter) const override {
    // Enforce type invariants.
    // Note that we deduce the actual element type from the variance,
    // which should not be subject to quantization at a higher level.
    auto input_type = bn_op.operand().getType().dyn_cast<RankedTensorType>();
    auto variance_type =
        bn_op.variance().getType().dyn_cast<RankedTensorType>();
    if (!input_type || !variance_type) {
      return failure();
    }
    auto fp_type = variance_type.getElementType().dyn_cast<FloatType>();
    if (!fp_type) {
      return failure();
    }
    int64_t feature_dim = bn_op.feature_index().getSExtValue();

    // Add epsilon to the variance and sqrt to get stddev:
    // stddev = sqrt(variance + epsilon)
    auto epsilon =
        MaterializeEpsilon(bn_op.getOperation(), bn_op.epsilonAttr(), fp_type,
                           bn_op.variance(), variance_type, rewriter);
    if (!epsilon) {
      return failure();
    }
    Value stddev = rewriter.create<xla_hlo::AddOp>(bn_op.getLoc(),
                                                   bn_op.variance(), epsilon);
    stddev = rewriter.create<xla_hlo::SqrtOp>(bn_op.getLoc(), stddev);

    // Broadcast all terms.
    Value shape_value;
    if (!input_type.hasStaticShape()) {
      shape_value =
          CalculateShapeValue(bn_op.getLoc(), bn_op.operand(), rewriter);
    }
    auto broadcast_scale =
        BroadcastToFeatureDim(bn_op.getLoc(), input_type, bn_op.scale(),
                              shape_value, feature_dim, rewriter);
    auto broadcast_offset =
        BroadcastToFeatureDim(bn_op.getLoc(), input_type, bn_op.offset(),
                              shape_value, feature_dim, rewriter);
    auto broadcast_mean =
        BroadcastToFeatureDim(bn_op.getLoc(), input_type, bn_op.mean(),
                              shape_value, feature_dim, rewriter);
    auto broadcast_stddev = BroadcastToFeatureDim(
        bn_op.getLoc(), input_type, stddev, shape_value, feature_dim, rewriter);

    // Compute:
    // scale * (input - mean) / stddev + offset
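    // Worked example with illustrative numbers only: with scale = 1,
    // offset = 0, mean = 0, variance = 3 and epsilon = 1, we get
    // stddev = sqrt(3 + 1) = 2, so an input of 4 produces
    // 1 * (4 - 0) / 2 + 0 = 2.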
    Value result = rewriter.create<xla_hlo::SubOp>(
        bn_op.getLoc(), bn_op.operand(), broadcast_mean);
    result = rewriter.create<xla_hlo::MulOp>(bn_op.getLoc(), result,
                                             broadcast_scale);
    result = rewriter.create<xla_hlo::DivOp>(bn_op.getLoc(), result,
                                             broadcast_stddev);
    rewriter.replaceOpWithNewOp<xla_hlo::AddOp>(bn_op, result,
                                                broadcast_offset);

    return success();
  }
};

}  // namespace

// Populates conversion patterns to unfuse batch normalization operations.
// In combination with marking such ops as illegal, this allows backends that
// do not have special support for fused batchnorm to use simpler arithmetic
// primitives.
void PopulateUnfuseBatchNormPatterns(MLIRContext* context,
                                     OwningRewritePatternList* patterns) {
  patterns->insert<UnfuseBatchNormInferencePattern>(context);
}

}  // namespace xla_hlo
}  // namespace mlir
@ -0,0 +1,46 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_hlo {

namespace {

struct TestUnfuseBatchNormPass
    : public PassWrapper<TestUnfuseBatchNormPass, OperationPass<>> {
  void runOnOperation() override {
    OwningRewritePatternList patterns;
    PopulateUnfuseBatchNormPatterns(&getContext(), &patterns);
    applyPatternsAndFoldGreedily(getOperation(), patterns);
  }
};

}  // namespace

}  // namespace xla_hlo
}  // namespace mlir

static mlir::PassRegistration<mlir::xla_hlo::TestUnfuseBatchNormPass> pass(
    "test-xla-unfuse-batch-norm",
    "Test pass for unfusing batch norm into primitive arithmetic ops");
@ -0,0 +1,579 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
#include "mlir/IR/MLIRContext.h"  // TF:llvm-project
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"  // TF:llvm-project
#include "mlir/Transforms/RegionUtils.h"  // TF:llvm-project
#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/EquivalenceClasses.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h"

// This pass has similar functionality to the fusion pass in the XLA stack.
// However, unlike XLA, it targets the fully dynamic shape scenario.
// Currently, it implements the kLoop and kInput fusion templates.
// During conversion, it tries to greedily find kLoop/kInput fusion
// patterns.
//
// Similar to XLA, this pass supports fusion patterns having multiple outputs
// as long as all the output shapes are consistent. The following are some
// examples.
//
//        kLoop                               kInput
// +----+  +----+  +----+    +----+    +----+    +----+
// |elem|  |elem|  |elem|    |elem<----+elem+---->elem+----+
// +-+--+  +-+--+  +-+--+    +-+--+    +----+    +-+--+    |
//   |       |       |         |                   |       |
//   |       |       |         |                   |       |
// +-v--+    |     +-v--+   +--v---+            +--v---+   |
// |elem+<---+----<+elem|   |reduce|            |reduce|   |
// +-+--+          +-+--+   +--+---+            +--+---+   |
//   |               |         |                   |       |
//   |               |         |                   |       |
//   v               v         v                   v       v
//
// To this end, we also add a simple shape constraint analysis phase.
// The kLoop fusion template requires all the outputs of the fused
// pattern to have the same shape. However, in the dynamic shape world we
// don't know the actual value of the shape at compile time.
// Fortunately, we can still infer the relationship among different ops
// according to their shape constraint traits. Currently, we only consider
// shape equality propagation for elementwise ops (assuming that implicit
// shape broadcast is forbidden). The above process could be built on the
// shape dialect once it is ready.
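//
// As a hedged illustration (the op names follow the xla_hlo dialect used
// below; the textual IR is schematic and the shapes are invented), a minimal
// kLoop candidate looks like:
//   %0 = xla_hlo.add %arg0, %arg1 : tensor<?xf32>
//   %1 = xla_hlo.multiply %0, %arg2 : tensor<?xf32>
// Both ops are elementwise and constrained to equal shapes, so they can be
// grouped into one fusion pattern and emitted as a single xla_hlo.fusion op.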

namespace mlir {
namespace xla_hlo {
namespace {

using llvm::EquivalenceClasses;
using FusionPattern = std::vector<Operation*>;
using FusionPlan = std::vector<FusionPattern>;

// To support using EquivalenceClasses for Value.
class ValueWrapper {
 public:
  explicit ValueWrapper(Value value) : value_(std::move(value)) {}

  Value getValue() const { return value_; }

  bool operator==(const ValueWrapper& rhs) const {
    return getValue() == rhs.getValue();
  }

 private:
  Value value_;
};

bool operator<(const ValueWrapper& lhs, const ValueWrapper& rhs) {
  auto lhs_value = lhs.getValue().getAsOpaquePointer();
  auto rhs_value = rhs.getValue().getAsOpaquePointer();
  return lhs_value < rhs_value;
}

bool IsFusible(Operation* op) {
  if (matchPattern(op, m_Constant())) {
    return true;
  }
  auto op_fusibility = dyn_cast<InferFusibilityOpInterface>(op);
  return op_fusibility && (op_fusibility.isFusibleWithOperand() ||
                           op_fusibility.isFusibleWithConsumer());
}

SmallVector<Value, 4> GetInputsOfFusionPattern(const FusionPattern& pattern) {
  SmallVector<Value, 4> inputs;
  DenseSet<Value> input_set;
  DenseSet<Operation*> op_set;
  for (Operation* op : pattern) {
    bool inserted = op_set.insert(op).second;
    (void)inserted;
    assert(inserted && "FusionPattern contains duplicate operations");
  }

  for (Operation* op : pattern) {
    for (Value operand : op->getOperands()) {
      Operation* operand_op = operand.getDefiningOp();
      if (op_set.find(operand_op) != op_set.end()) {
        // Skip if the defining op is in the pattern.
        continue;
      }
      if (input_set.insert(operand).second) {
        inputs.push_back(operand);
      }
    }
  }
  return inputs;
}

SmallVector<Value, 4> GetOutputsOfFusionPattern(const FusionPattern& pattern) {
  SmallVector<Value, 4> outputs;
  DenseSet<Operation*> op_set;
  for (Operation* op : pattern) {
    bool inserted = op_set.insert(op).second;
    (void)inserted;
    assert(inserted && "FusionPattern contains duplicate operations");
  }

  for (Operation* op : pattern) {
    for (Value result : op->getResults()) {
      bool has_external_user = llvm::any_of(
          result.getUses(),
          [&](OpOperand& use) { return !op_set.count(use.getOwner()); });
      if (has_external_user) {
        outputs.push_back(result);
      }
    }
  }
  return outputs;
}

FusionPattern MergeFusionPattern(const FusionPattern& lhs,
                                 const FusionPattern& rhs) {
  FusionPattern pattern(lhs);
  pattern.insert(pattern.end(), rhs.begin(), rhs.end());
  return pattern;
}

inline int EffectiveSize(const FusionPattern& pattern) {
  return llvm::count_if(
      pattern, [](Operation* op) { return !matchPattern(op, m_Constant()); });
}

// This is a simple shape constraint analysis, which is used to
// guide fusion decisions (e.g. we only fuse shape-compatible ops).
//
// Currently, we only consider shape equality propagation based
// on the shape constraint traits of elementwise ops (assuming that
// implicit shape broadcast is forbidden).
class ShapeConstraintAnalysis {
 public:
  explicit ShapeConstraintAnalysis(const SmallVectorImpl<Operation*>& op_list) {
    PropagateEquality(op_list);
  }

  // Returns true if `lhs` and `rhs` are supposed to have the same shape.
  bool HasSameShape(Value lhs, Value rhs) {
    return impl_.isEquivalent(ValueWrapper(lhs), ValueWrapper(rhs));
  }

 private:
  // Shape equality propagation based on the shape constraints of
  // elementwise ops.
  void PropagateEquality(const SmallVectorImpl<Operation*>& op_list) {
    bool converged = true;
    do {
      converged = true;
      auto update = [&](Value lhs, Value rhs) {
        if (!impl_.isEquivalent(ValueWrapper(lhs), ValueWrapper(rhs))) {
          converged = false;
          impl_.unionSets(ValueWrapper(lhs), ValueWrapper(rhs));
        }
      };
      for (Operation* op : op_list) {
        auto op_fusibility = dyn_cast<InferFusibilityOpInterface>(op);
        if (!op_fusibility) continue;
        int numInput = op->getNumOperands();
        int numOutput = op->getNumResults();
        // Shape equality propagation between inputs.
        for (int input1 = 0; input1 < numInput; ++input1)
          for (int input2 = input1 + 1; input2 < numInput; ++input2)
            if (op_fusibility.inferInputsShapeEquality(input1, input2))
              update(op->getOperand(input1), op->getOperand(input2));

        // Shape equality propagation between outputs.
        for (int output1 = 0; output1 < numOutput; ++output1)
          for (int output2 = output1 + 1; output2 < numOutput; ++output2)
            if (op_fusibility.inferOutputsShapeEquality(output1, output2))
              update(op->getResult(output1), op->getResult(output2));

        // Shape equality propagation between inputs and outputs.
        for (int input = 0; input < numInput; ++input)
          for (int output = 0; output < numOutput; ++output)
            if (op_fusibility.inferInputOutputShapeEquality(input, output))
              update(op->getOperand(input), op->getResult(output));
      }
    } while (!converged);
  }

  // A UnionFind set.
  EquivalenceClasses<ValueWrapper> impl_;
};
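
// A brief usage sketch (hedged; this mirrors how FusionPlanner below consumes
// the analysis, and the value names are hypothetical):
//   ShapeConstraintAnalysis shape_analysis(op_list);
//   if (shape_analysis.HasSameShape(out_a, out_b)) {
//     // out_a and out_b may be emitted into the same parallel loop.
//   }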

// A fusion planner that can propose a fusion plan for a block of ops.
// The fusion plan consists of a group of fusion patterns.
//
// Currently all proposed patterns follow XLA's kLoop/kInput fusion
// templates, adapted to the fully dynamic shape world.
//
// A kLoop fusion template satisfies:
//   - All ops in the fusion pattern are elementwise.
//   - All the output shapes of the fusion pattern are the same, and thus
//     can fit into the same parallel loop.
//
// A kInput fusion template satisfies:
//   - Any op in the fusion pattern is either elementwise or a reduction.
//   - If an op is a reduction, its output cannot be consumed by other
//     ops in the same fusion pattern.
//   - All the effective shapes of the outputs of the fusion pattern are
//     the same.
//     - For an elementwise op, its effective shape is its output shape.
//     - For a reduction op, its effective shape is its operand shape.
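//
// For illustration (a hedged sketch with invented shapes; the reduce form is
// abbreviated, not exact printer output), a kInput candidate pairs an
// elementwise producer with a reduction whose *operand* shape matches the
// producer's output shape:
//   %0 = xla_hlo.exponential %arg0 : tensor<?x?xf32>
//   %1 = "xla_hlo.reduce"(%0, %init) ({ ... }) : ... -> tensor<?xf32>
// Both have the effective shape of %0, so they may share one fusion pattern.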
class FusionPlanner {
 public:
  explicit FusionPlanner(const SmallVectorImpl<Operation*>& op_list)
      : op_list_(op_list),
        shape_analysis_(op_list),
        cycle_detector_(op_list.size()) {
    BuildNodeMap();
  }

  // Returns a fusion plan on success, otherwise none.
  llvm::Optional<FusionPlan> Run() {
    // Greedily search for connected fusible patterns; ops belonging to
    // the same fusion pattern are grouped into a cluster.
    RunEdgeContractionLoop();

    // After doing edge contraction, each unique cluster having size
    // more than one represents a potential fusion pattern.
    // We collect all these clusters and construct a fusion plan.
    //
    // Note that the ops in a fusion pattern are in topological ordering.
    FusionPlan plan;
    DenseMap<int, int> pattern_ids;
    for (Operation* op : op_list_) {
      Cluster* cluster = GetClusterForNode(op);
      int node_id = cluster->cycles_graph_node_id();
      if (!IsFusible(op_list_[node_id]) ||
          EffectiveSize(GetClusterForNode(op)->fused_pattern()) <= 1) {
        continue;
      }
      if (!pattern_ids.count(node_id)) {
        int pattern_id = pattern_ids.size();
        pattern_ids[node_id] = pattern_id;
        plan.emplace_back();
      }
      plan[pattern_ids[node_id]].push_back(op);
    }
    return plan;
  }

  // Returns the op_list this planner operates on.
  const SmallVectorImpl<Operation*>& op_list() const { return op_list_; }

 private:
  // Represents a (partial) fused pattern.
  class Cluster {
   public:
    Cluster(int node_id, FusionPlanner* planner) : node_id_(node_id) {
      const SmallVectorImpl<Operation*>& op_list = planner->op_list();
      pattern_.push_back(op_list[node_id]);
    }

    // Merges `other` into this cluster, and clears `other`.
    void Merge(Cluster* other) {
      pattern_.insert(pattern_.end(), other->pattern_.begin(),
                      other->pattern_.end());
      other->pattern_.clear();
    }

    // The number of nodes in this cluster.
    int cluster_size() const { return pattern_.size(); }

    // The ID of the cluster as represented in `cycle_detector_`.
    int cycles_graph_node_id() const { return node_id_; }

    // Sets the ID of the cluster as represented in `cycle_detector_`.
    void set_cycles_graph_node_id(int cycles_graph_node_id) {
      node_id_ = cycles_graph_node_id;
    }

    // The fused pattern this cluster currently holds.
    const FusionPattern& fused_pattern() { return pattern_; }

   private:
    // ID of the representative node of this cluster.
    int node_id_;

    // The fused pattern this cluster holds.
    FusionPattern pattern_;
  };

 private:
  Cluster* MakeCluster(int cycles_graph_node_id) {
    cluster_storage_.emplace_back(new Cluster(cycles_graph_node_id, this));
    return cluster_storage_.back().get();
  }

  void BuildNodeMap() {
    int num_nodes = op_list_.size();
    for (int node_id = 0; node_id < num_nodes; ++node_id) {
      Operation* op = op_list_[node_id];
      MakeCluster(node_id);
      op_to_node_id_[op] = node_id;
      leader_for_node_.insert(node_id);
      for (Value operand : op->getOperands()) {
        Operation* operand_op = operand.getDefiningOp();
        if (operand_op == nullptr) {
          // Skip block arguments.
          continue;
        }
        auto iter = op_to_node_id_.find(operand_op);
        assert(iter != op_to_node_id_.end());
        cycle_detector_.InsertEdge(iter->second, node_id);
      }
    }
  }

  // Returns the cluster that contains this op.
  Cluster* GetClusterForNode(Operation* n) {
    int id = op_to_node_id_.at(n);
    id = leader_for_node_.getLeaderValue(id);
    return cluster_storage_[id].get();
  }

  // Returns the cluster that contains the op having `node_id`.
  Cluster* GetClusterForCyclesGraphNode(int node_id) {
    return cluster_storage_[leader_for_node_.getLeaderValue(node_id)].get();
  }

  // Merges the clusters `cluster_from` and `cluster_to`.
  bool MergeClusters(Cluster* cluster_from, Cluster* cluster_to) {
    int from = cluster_from->cycles_graph_node_id();
    int to = cluster_to->cycles_graph_node_id();

    auto optional_merged_node = cycle_detector_.ContractEdge(from, to);
    if (!optional_merged_node.hasValue()) {
      llvm::dbgs() << "Could not contract " << from << " -> " << to
                   << " because contracting the edge would create a cycle.";
      return false;
    }

    // Merge the clusters.
    cluster_from->Merge(cluster_to);
    cluster_from->set_cycles_graph_node_id(*optional_merged_node);

    // Merge the UnionFind set.
    leader_for_node_.unionSets(from, to);
    return true;
  }

  template <typename FnTy>
  bool ForEachEdgeInPostOrder(FnTy fn) {
    bool changed = false;
    for (int32_t node : cycle_detector_.AllNodesInPostOrder()) {
      Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
      // Make a copy of the set of successors because we may modify the graph
      // in TryToContractEdge.
      std::vector<int32_t> successors_copy =
          cycle_detector_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

      for (int to : successors_copy) {
        Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
        bool contracted_edge = fn(cluster_from, cluster_to);
        changed |= contracted_edge;
      }
    }

    return changed;
  }

  // Returns the outputs if the two clusters were merged.
  SmallVector<Value, 4> GetResultsOfFusedPattern(Cluster* from, Cluster* to) {
    FusionPattern fused_pattern =
        MergeFusionPattern(from->fused_pattern(), to->fused_pattern());
    return GetOutputsOfFusionPattern(fused_pattern);
  }

  // This function checks whether fusing `from` with `to` is valid and, if so,
  // performs the merge. The validity is based on the operations in the
  // clusters and the compatibility of the shapes of the outputs of the
  // would-be fused clusters.
  // Returns true if the merge was performed.
  bool TryToContractEdge(Cluster* from, Cluster* to) {
    int node_to = to->cycles_graph_node_id();
    int node_from = from->cycles_graph_node_id();

    // Both node_to and node_from should be fusible.
    if (!IsFusible(op_list_[node_to]) || !IsFusible(op_list_[node_from])) {
      return false;
    }

    auto op_from_fusibility =
        dyn_cast<InferFusibilityOpInterface>(op_list_[node_from]);
    if (op_from_fusibility && !op_from_fusibility.isFusibleWithConsumer()) {
      // This op cannot be fused with its consumers.
      return false;
    }

    auto op_to_fusibility =
        dyn_cast<InferFusibilityOpInterface>(op_list_[node_to]);
    if (op_to_fusibility && !op_to_fusibility.isFusibleWithOperand()) {
      // This op cannot be fused with its operands.
      return false;
    }

    // Output shapes of a fusion pattern should be compatible as described in
    // the documentation of this class.
    SmallVector<Value, 4> results = GetResultsOfFusedPattern(from, to);
    auto get_workload_shape = [](Value v) {
      Operation* op = v.getDefiningOp();
      // Block argument.
      if (!op) return v;
      auto op_fusibility = dyn_cast<InferFusibilityOpInterface>(op);
      // Const value.
      if (!op_fusibility) return v;
      llvm::Optional<Value> workload =
          op_fusibility.inferEffectiveWorkloadShape();
      return workload.hasValue() ? *workload : v;
    };

    Value ref = get_workload_shape(results[0]);
    if (!llvm::all_of(results, [&](Value result) {
          Value val = get_workload_shape(result);
          return shape_analysis_.HasSameShape(ref, val);
        })) {
      return false;
    }

    return MergeClusters(from, to);
  }

  // Greedily fuse connected nodes.
  bool RunEdgeContractionLoop() {
    using std::placeholders::_1;
    using std::placeholders::_2;
    return ForEachEdgeInPostOrder(
        std::bind(&FusionPlanner::TryToContractEdge, this, _1, _2));
  }

  const SmallVectorImpl<Operation*>& op_list_;

  // Shape equality checker.
  ShapeConstraintAnalysis shape_analysis_;

  // op -> node_id
  std::unordered_map<Operation*, int> op_to_node_id_;

  // Makes sure we do not introduce a cycle after fusion.
  GraphCycles cycle_detector_;
  std::vector<std::unique_ptr<Cluster>> cluster_storage_;

  // A UnionFind set. Each set represents a (partial) fused pattern
  // and has a leader as its representative.
  EquivalenceClasses<int32_t> leader_for_node_;
};

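// Before/after sketch (hedged; the textual IR is schematic, not exact printer
// output). Given two fusible elementwise ops,
//   %0 = xla_hlo.add %a, %b : tensor<?xf32>
//   %1 = xla_hlo.subtract %0, %c : tensor<?xf32>
// ApplyFusionPlan below moves them into the region of a single
// xla_hlo.fusion op whose operands are {%a, %b, %c}, terminates the region
// with "xla_hlo.return"(%1), and rewires external uses of %1 to the fusion
// result.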
struct XlaHloFusion : public mlir::PassWrapper<XlaHloFusion, FunctionPass> {
  void runOnFunction() override {
    FuncOp func = getFunction();
    if (!IsTargetFunc(func)) {
      return;
    }

    // Process each block and do fusion within it.
    for (Block& block : func) {
      SmallVector<Operation*, 4> op_list;
      for (Operation& op : block) {
        op_list.push_back(&op);
      }

      FusionPlanner planner(op_list);
      llvm::Optional<FusionPlan> plan = planner.Run();
      if (!plan) {
        emitError(func.getLoc(), "can't find a fusion plan");
        signalPassFailure();
        return;
      }
      if (!ApplyFusionPlan(*plan)) {
        emitError(func.getLoc(), "apply fusion plan failed");
        signalPassFailure();
        return;
      }
    }
  }

  bool IsTargetFunc(FuncOp func) {
    int num_fusible_ops = 0;
    bool is_target_func = false;
    // We only process functions having enough fusion candidates.
    func.walk([&](Operation* op) {
      num_fusible_ops +=
          static_cast<int>(dyn_cast<InferFusibilityOpInterface>(op) != nullptr);
      is_target_func = (num_fusible_ops > 1);
      // Early stop.
      if (is_target_func) return WalkResult::interrupt();
      return WalkResult::advance();
    });
    return is_target_func;
  }

  bool ApplyFusionPlan(const FusionPlan& plan) {
    for (const FusionPattern& pattern : plan) {
      OpBuilder b(pattern.back());

      SmallVector<Location, 4> locations;
      locations.reserve(pattern.size());
      for (Operation* op : pattern) {
        locations.push_back(op->getLoc());
      }
      Location fused_loc =
          FusedLoc::get(locations, pattern.back()->getContext());

      SmallVector<Value, 4> inputs = GetInputsOfFusionPattern(pattern);
      SmallVector<Value, 4> outputs = GetOutputsOfFusionPattern(pattern);
      SmallVector<Type, 4> output_types;
      output_types.reserve(outputs.size());
      for (Value v : outputs) {
        output_types.push_back(v.getType());
      }

      FusionOp fusion =
          b.create<xla_hlo::FusionOp>(fused_loc, output_types, inputs);
      Region& region = fusion.fused_computation();
      region.push_back(new Block);
      Block& block = region.front();
      for (Operation* op : pattern) {
        op->moveBefore(&block, block.end());
      }
      b.setInsertionPoint(&block, block.end());
      b.create<xla_hlo::ReturnOp>(fused_loc, outputs);

      for (auto output_and_result : llvm::zip(outputs, fusion.getResults())) {
        Value output = std::get<0>(output_and_result);
        Value fusion_result = std::get<1>(output_and_result);
        for (OpOperand& use : llvm::make_early_inc_range(output.getUses())) {
          if (use.getOwner()->getBlock() != &block) use.set(fusion_result);
        }
      }
    }
    return true;
  }
};

}  // namespace

std::unique_ptr<OperationPass<FuncOp>> createXlaHloFusion() {
  return std::make_unique<XlaHloFusion>();
}

static PassRegistration<XlaHloFusion> xla_hlo_fusion_pass(
    "xla-hlo-fusion", "Fuse xla_hlo ops into kLoop/kInput fusion patterns.");

}  // namespace xla_hlo
}  // namespace mlir
@ -0,0 +1,909 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file implements logic for lowering HLO/LHLO dialect to Linalg dialect.

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/AffineExpr.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Attributes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Builders.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Location.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_xla_to_scalar_op.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace {

SmallVector<StringRef, 3> GetNParallelLoopsAttrs(unsigned nParallelLoops) {
  static constexpr StringRef kParallelIterType = "parallel";
  return SmallVector<StringRef, 3>(nParallelLoops, kParallelIterType);
}

template <bool isLHLO = true>
Value getResultValue(Operation* op) {
  return isLHLO ? op->getOperand(op->getNumOperands() - 1) : op->getResult(0);
}

template <bool isLHLO = true>
ShapedType getXLAOpResultType(Operation* op) {
  return getResultValue<isLHLO>(op).getType().template cast<ShapedType>();
}

template <bool isLHLO = true>
bool verifyXLAOpBufferOrTensorSemantics(Operation* op) {
  auto verifyType = [&](Value val) -> bool {
    return (isLHLO && val.getType().isa<MemRefType>()) ||
           (!isLHLO && val.getType().isa<RankedTensorType>());
  };
  if (!llvm::all_of(op->getOperands(), verifyType)) return false;
  return isLHLO ? op->getResults().empty()
                : llvm::all_of(op->getResults(), verifyType);
}

template <typename OpTy, bool isLHLO = true>
class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
 public:
  using OpConversionPattern<OpTy>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      OpTy op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = op.getLoc();
    auto argType =
        op.getOperation()->getOperand(0).getType().template cast<ShapedType>();
    if (!argType.hasRank()) {
      emitError(loc, "lhlo to linalg conversion expects ranked args");
      return failure();
    }
    auto elemTy = argType.getElementType();
    if (!elemTy.isSignlessIntOrFloat() && !elemTy.template isa<ComplexType>()) {
      return failure();
    }

    // Construct the indexing maps needed for linalg.generic ops.
    SmallVector<AffineMap, 2> indexing_maps;
    SmallVector<Type, 4> bodyArgTypes, bodyResultTypes, opResultTypes;

    // This doesn't account for implicit broadcast, but the working assumption
    // here is that all broadcasts have been made explicit.
    unsigned nloops = argType.getRank();

    if (isLHLO && !nloops) return failure();

    int operandCount = (isLHLO ? args.size() - 1 : args.size());
    auto verifyArgOrResultType = [&](Value val) -> ShapedType {
      auto shapedType = val.getType().dyn_cast<ShapedType>();
      if (!shapedType ||
          (!shapedType.isa<MemRefType>() &&
           !shapedType.isa<RankedTensorType>()) ||
          shapedType.getRank() != nloops)
        return nullptr;
      indexing_maps.emplace_back(
          nloops ? rewriter.getMultiDimIdentityMap(nloops)
                 : AffineMap::get(nloops, 0, rewriter.getContext()));
      return shapedType;
    };
    for (const auto& arg : llvm::enumerate(args)) {
      auto shapedType = verifyArgOrResultType(arg.value());
      if (!shapedType) return failure();
      auto& result_or_body_arg =
          arg.index() < operandCount ? bodyArgTypes : bodyResultTypes;
      result_or_body_arg.emplace_back(shapedType.getElementType());
    }
    if (!isLHLO) {
      // HLO operations have return as tensor types.
      assert(bodyResultTypes.empty() &&
             "When lowering HLO ops result can't be part of arguments");
      Value result = op.getOperation()->getResult(0);
      auto shapedType = verifyArgOrResultType(result);
      if (!shapedType) return failure();
      bodyResultTypes.push_back(shapedType.getElementType());
      opResultTypes.push_back(shapedType);
    }

    int64_t args_count = bodyArgTypes.size();
    int64_t results_count = bodyResultTypes.size();
    auto linalgOp = rewriter.create<linalg::GenericOp>(
        loc, opResultTypes, args, args_count, results_count, indexing_maps,
        GetNParallelLoopsAttrs(nloops),
        [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) {
          // TODO(ravishankarm) : For now use the method in xla_lhlo namespace.
          // That method needs to be moved out of there.
          Value opResult = xla_lhlo::XlaOpToStdScalarOp::map<OpTy>(
              op, bodyResultTypes,
              llvm::to_vector<2>(args.take_front(args_count)), &rewriter);
          nestedBuilder.create<linalg::YieldOp>(loc, opResult);
        });
    rewriter.replaceOp(op, linalgOp.getOperation()->getResults());
    return success();
  }
};

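// Lowering sketch (hedged; the SSA names and shapes are invented). A
// pointwise xla_lhlo.add on buffers such as
//   "xla_lhlo.add"(%lhs, %rhs, %out) :
//       (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
// becomes a linalg.generic with identity indexing maps for all three
// buffers, two "parallel" iterators, and a body consisting of one scalar
// add feeding linalg.yield.
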
template <typename LhloOp>
class ScalarPointwiseToStandardConverter : public OpConversionPattern<LhloOp> {
 public:
  using OpConversionPattern<LhloOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      LhloOp lhlo_op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = lhlo_op.getLoc();
    auto argType =
        lhlo_op.getOperand(0).getType().template dyn_cast<ShapedType>();
    if (!argType || !argType.getElementType().isSignlessIntOrFloat() ||
        (argType.getRank() != 0)) {
      return failure();
    }

    // Create two loads from the inputs.
    auto lhs = rewriter.create<LoadOp>(loc, lhlo_op.lhs());
    auto rhs = rewriter.create<LoadOp>(loc, lhlo_op.rhs());
    // TODO(ravishankarm) : Move this method out of xla_lhlo namespace.
    Value opResult = xla_lhlo::XlaOpToStdScalarOp::map<LhloOp>(
        lhlo_op, argType.getElementType(), llvm::ArrayRef<Value>{lhs, rhs},
        &rewriter);
    rewriter.create<StoreOp>(loc, opResult, lhlo_op.out());
    rewriter.eraseOp(lhlo_op);
    return success();
  }
};

//===----------------------------------------------------------------------===//
// xla_lhlo.convolution conversion pattern.
//===----------------------------------------------------------------------===//

/// Converts an xla_lhlo.convolution operation to a linalg.conv op.
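/// (For the 2-D case, the dimension-order checks below correspond to an NHWC
/// input/output layout and an HWCF, i.e. height, width, input-feature,
/// output-feature, filter layout.)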
struct ConvToLinalgConverter : public OpConversionPattern<xla_lhlo::ConvOp> {
 public:
  using OpConversionPattern<xla_lhlo::ConvOp>::OpConversionPattern;

  // This code has been adapted from IREE's
  // (https://github.com/google/iree/) xla_hlo -> linalg conversion.
  LogicalResult matchAndRewrite(
      xla_lhlo::ConvOp op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    // Check validity of dimension information.
    if (const xla_lhlo::ConvDimensionNumbers& dimensionNumbers =
            op.dimension_numbers()) {
      const int inputSpatialRank =
          llvm::size(dimensionNumbers.input_spatial_dimensions());
      // The dimensions for input should follow the order of
      // batch_count, spatial_dims..., input_feature_count.
      if (dimensionNumbers.input_batch_dimension().getInt() != 0 ||
          dimensionNumbers.input_feature_dimension().getInt() !=
              (inputSpatialRank + 1))
        return failure();

      const int kernelSpatialRank =
          llvm::size(dimensionNumbers.kernel_spatial_dimensions());
      // The dimensions for filter should follow the order of
      // spatial_dims..., input_feature_count, num_output_feature_count.
      if (dimensionNumbers.kernel_input_feature_dimension().getInt() !=
              kernelSpatialRank ||
          dimensionNumbers.kernel_output_feature_dimension().getInt() !=
              (kernelSpatialRank + 1))
        return failure();

      const int outputSpatialRank =
          llvm::size(dimensionNumbers.output_spatial_dimensions());
      // The dimensions for output should follow the order of
      // batch_count, spatial_dims..., output_feature_count.
      if (dimensionNumbers.output_batch_dimension().getInt() != 0 ||
          dimensionNumbers.output_feature_dimension().getInt() !=
              (outputSpatialRank + 1))
        return failure();

      if (inputSpatialRank != outputSpatialRank ||
          inputSpatialRank != kernelSpatialRank)
        return failure();

      auto inputSpatialDim =
          dimensionNumbers.input_spatial_dimensions().begin();
      auto kernelSpatialDim =
          dimensionNumbers.kernel_spatial_dimensions().begin();
      auto outputSpatialDim =
          dimensionNumbers.output_spatial_dimensions().begin();
      // Check if spatial dims are ordered correctly.
      for (int i = 0; i < inputSpatialRank; ++i) {
        const int dim = i + 1;
        if ((*inputSpatialDim++).getZExtValue() != dim ||
            (*outputSpatialDim++).getZExtValue() != dim ||
            (*kernelSpatialDim++).getZExtValue() != i)
          return failure();
      }
    }

    // TODO: LHS dilation for deconvolution not supported yet.
    if (op.lhs_dilation()) {
      return failure();
    }

    llvm::SmallVector<Attribute, 4> strides;
    if (auto windowStrides = op.window_strides()) {
      auto range = windowStrides->getAttributeValues();
      strides.assign(range.begin(), range.end());
    }
    auto stridesArg = ArrayAttr::get(strides, op.getContext());

    llvm::SmallVector<Attribute, 2> dilation;
    if (auto rhsDilation = op.rhs_dilation()) {
      auto range = rhsDilation->getAttributeValues();
      dilation.assign(range.begin(), range.end());
    } else {
      // Default dilation of 1.
      dilation.resize(2, IntegerAttr::get(rewriter.getIntegerType(64), 1));
    }
    auto dilationArg = ArrayAttr::get(dilation, op.getContext());

    // Set padding only if it is non-zero.
    DenseIntElementsAttr padding = op.paddingAttr();
    if (!padding || !llvm::any_of(padding.getValues<APInt>(), [](APInt intVal) {
          return !intVal.isNullValue();
        })) {
      padding = nullptr;
    }

    // The order of input and filter is switched with linalg.conv.
    rewriter.replaceOpWithNewOp<linalg::ConvOp>(
        op, args[1], args[0], args[2], stridesArg, dilationArg, padding);
    return success();
  }
};

/// Base class for lowering XLA operations that have one operand and one
/// result, and are semantically equivalent to a copy of the input to the
/// output (like transpose, some reshape, etc.). The derived classes need to
/// provide a method `getIndexingMaps` that returns AffineMaps for the index
/// maps of the input and the output.
template <typename Derived, typename OpTy, bool isLHLO = true>
class DataMovementOpConverter : public OpConversionPattern<OpTy> {
 public:
  using OpConversionPattern<OpTy>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      OpTy op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    if (!verifyXLAOpBufferOrTensorSemantics<isLHLO>(op)) return failure();
    auto resultType = getXLAOpResultType<isLHLO>(op);

    SmallVector<AffineMap, 2> indexing_maps =
        Derived::getIndexingMaps(op, &rewriter);
    if (indexing_maps.empty()) return failure();

    auto nloops = resultType.getRank();
    auto loc = op.getLoc();
    auto linalgOp = rewriter.create<linalg::GenericOp>(
        loc, isLHLO ? ArrayRef<Type>{} : resultType, args, /*inputCount=*/1,
        /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops),
        [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) {
          nestedBuilder.create<linalg::YieldOp>(loc, *args.begin());
        });

    rewriter.replaceOp(op, linalgOp.getOperation()->getResults());
    return success();
  }
};

/// Pattern to convert BroadcastOp to Linalg ops.
template <typename OpTy, bool isLHLO = true>
class BroadcastConverter
    : public DataMovementOpConverter<BroadcastConverter<OpTy, isLHLO>, OpTy,
                                     isLHLO> {
 public:
  using DataMovementOpConverter<BroadcastConverter, OpTy,
                                isLHLO>::DataMovementOpConverter;

  static SmallVector<AffineMap, 2> getIndexingMaps(OpTy broadcastOp,
                                                   Builder* b) {
    ShapedType inputType =
        broadcastOp.operand().getType().template cast<ShapedType>();
    unsigned inputRank = inputType.getRank();
    unsigned nloops = getXLAOpResultType<isLHLO>(broadcastOp).getRank();

    // BroadcastOp prepends the dimensions in the `broadcast_sizes` attribute
    // to the input's dimensions.
    unsigned numPrependedDims = llvm::size(broadcastOp.broadcast_sizes());
    SmallVector<AffineExpr, 4> inputDimExprs;
    inputDimExprs.reserve(inputRank);
    for (int i = 0; i < inputRank; ++i) {
      inputDimExprs.push_back(b->getAffineDimExpr(numPrependedDims + i));
    }

    AffineMap inputMap;
    MLIRContext* context = b->getContext();
    if (inputDimExprs.empty()) {
      // The input is a scalar, i.e. this is a scalar broadcast op.
      inputMap = AffineMap::get(nloops, /*symbolCount=*/0, context);
    } else {
      inputMap =
          AffineMap::get(nloops, /*symbolCount=*/0, inputDimExprs, context);
    }
    return {inputMap, b->getMultiDimIdentityMap(nloops)};
  }
};

class HloBroadcastInDimConverter
    : public DataMovementOpConverter<HloBroadcastInDimConverter,
                                     xla_hlo::BroadcastInDimOp, false> {
 public:
  using DataMovementOpConverter<HloBroadcastInDimConverter,
                                xla_hlo::BroadcastInDimOp,
                                false>::DataMovementOpConverter;

  static SmallVector<AffineMap, 2> getIndexingMaps(
      xla_hlo::BroadcastInDimOp broadcastOp, Builder* b) {
    auto resultType = getXLAOpResultType<false>(broadcastOp);
    auto operandType =
        broadcastOp.operand().getType().template cast<ShapedType>();
    unsigned nloops = resultType.getRank();

    // The input is a scalar, i.e. this is a scalar broadcast op.
    if (operandType.getRank() == 0) {
      return {AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()),
              b->getMultiDimIdentityMap(nloops)};
    }

    auto operandShape = operandType.getShape();
    SmallVector<AffineExpr, 4> dimExprs;
    dimExprs.reserve(nloops);

    if (broadcastOp.broadcast_dimensions()) {
      for (const auto& broadcastDim :
           enumerate(broadcastOp.broadcast_dimensions().getIntValues())) {
        int size = broadcastDim.value().getSExtValue();
        bool expansion_needed = operandShape[broadcastDim.index()] == 1 &&
                                resultType.getShape()[size] != 1;
        dimExprs.push_back(expansion_needed ? b->getAffineConstantExpr(0)
                                            : b->getAffineDimExpr(size));
      }
    }
    return {
        AffineMap::get(nloops, /*symbolCount=*/0, dimExprs, b->getContext()),
        b->getMultiDimIdentityMap(nloops)};
  }
};
||||||
|
|
||||||
|
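// For illustration (a hypothetical instance of the pattern above): with an
// operand of type tensor<1x8xf32>, `broadcast_dimensions = [0, 1]`, and a
// result of type tensor<4x8xf32>, dimension 0 is a size-1 expansion, so the
// input map becomes (d0, d1) -> (0, d1) while the output map stays the
// identity (d0, d1) -> (d0, d1).
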
class LhloBroadcastInDimConverter
    : public OpConversionPattern<xla_lhlo::BroadcastInDimOp> {
 public:
  using OpConversionPattern<xla_lhlo::BroadcastInDimOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::BroadcastInDimOp op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    xla_lhlo::BroadcastInDimOp::Adaptor operand_adaptor(args);
    auto result_type = operand_adaptor.output().getType().cast<MemRefType>();
    auto result_shape = result_type.getShape();

    auto operand_and_dims = InsertReshapeIfNecessary(op, args, rewriter);

    Value operand = std::get<0>(operand_and_dims);
    auto broadcast_dims = std::get<1>(operand_and_dims);

    auto loc = op.getLoc();
    auto nloops = result_type.getRank();
    auto operand_type = operand.getType().cast<MemRefType>();

    // For a degenerate case, i.e. broadcasting with expansion of
    // memref<1xELEMENT_TYPE>, the operand is not passed to `linalg.generic`.
    // Instead the value is loaded and used directly in `linalg.yield`.
    if (operand_type.getRank() == 1 &&
        operand_type.getDimSize(0) <
            result_type.getDimSize(broadcast_dims.front())) {
      Value zero = rewriter.create<ConstantIndexOp>(loc, 0);
      Value val =
          rewriter.create<LoadOp>(loc, operand, llvm::makeArrayRef({zero}));
      rewriter.create<linalg::GenericOp>(
          loc, llvm::None, llvm::makeArrayRef(operand_adaptor.output()),
          /*inputCount=*/0, /*outputCount=*/1,
          llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)),
          GetNParallelLoopsAttrs(nloops),
          [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) {
            nestedBuilder.create<linalg::YieldOp>(loc, val);
          });
    } else {
      auto indexing_maps = getIndexingMaps(op, broadcast_dims, result_shape,
                                           operand_type, &rewriter);
      rewriter.create<linalg::GenericOp>(
          loc, llvm::None,
          llvm::makeArrayRef({operand, operand_adaptor.output()}),
          /*inputCount=*/1, /*outputCount=*/1, indexing_maps,
          GetNParallelLoopsAttrs(nloops),
          [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) {
            nestedBuilder.create<linalg::YieldOp>(loc, *args.begin());
          });
    }
    rewriter.replaceOp(op, llvm::None);
    return success();
  }

  // Inserts a 'linalg.reshape' if there is a size-1 dim expansion.
  std::pair<Value, SmallVector<int64_t, 2>> InsertReshapeIfNecessary(
      xla_lhlo::BroadcastInDimOp op, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const {
    xla_lhlo::BroadcastInDimOp::Adaptor operand_adaptor(args);
    Value operand = operand_adaptor.operand();
    auto operand_type = operand_adaptor.operand().getType().cast<MemRefType>();
    auto operand_shape = operand_type.getShape();

    Value result = operand_adaptor.output();
    auto result_type = result.getType().cast<MemRefType>();
    auto result_shape = result_type.getShape();

    SmallVector<int64_t, 2> operand_strides;
    int64_t operand_offset;
    if (failed(getStridesAndOffset(operand_type, operand_strides,
                                   operand_offset))) {
      op.emitOpError() << "Failed to get offset and strides.";
    }

    SmallVector<int64_t, 2> new_shape, new_strides, broadcast_dims;
    SmallVector<linalg::ReassociationIndices, 4> collapsed_dims_list;
    linalg::ReassociationIndices collapsed_dims;
    for (const auto& item :
         enumerate(op.broadcast_dimensions().getIntValues())) {
      size_t index = item.index();
      int dim = item.value().getSExtValue();

      collapsed_dims.push_back(index);

      bool expansion_needed =
          operand_shape[index] == 1 && result_shape[dim] != 1;
      if (expansion_needed) {
        continue;
      }
      new_shape.push_back(operand_shape[index]);
      new_strides.push_back(operand_strides[index]);
      broadcast_dims.push_back(dim);

      collapsed_dims_list.push_back(collapsed_dims);
      collapsed_dims.clear();
    }
    // If `collapsed_dims_list` is empty, then the memref has shape
    // [1, ..., 1] and all dimensions need expansion. Such a memref is
    // reshaped to a 1-D memref with a single element, and the new shape and
    // strides are updated accordingly.
    if (collapsed_dims_list.empty()) {
      collapsed_dims_list.push_back({});
      new_shape.push_back(1);
      new_strides.push_back(1);
      broadcast_dims.push_back(0);
    }
    for (const auto& dims : collapsed_dims) {
      collapsed_dims_list.back().push_back(dims);
    }

    // `linalg.reshape` is inserted only if necessary, i.e. when the rank can
    // be reduced.
    if (new_shape.size() < operand_shape.size()) {
      auto new_memref_type = MemRefType::get(
          new_shape, operand_type.getElementType(),
          makeStridedLinearLayoutMap(new_strides, operand_offset,
                                     rewriter.getContext()));
      operand = rewriter.create<linalg::ReshapeOp>(op.getLoc(),
                                                   new_memref_type,
                                                   operand_adaptor.operand(),
                                                   collapsed_dims_list);
    }
    return std::make_pair(operand, broadcast_dims);
  }

  SmallVector<AffineMap, 2> getIndexingMaps(xla_lhlo::BroadcastInDimOp op,
                                            ArrayRef<int64_t> broadcastDims,
                                            ArrayRef<int64_t> resultShape,
                                            MemRefType operandType,
                                            Builder* b) const {
    unsigned nloops = resultShape.size();

    // The input is a scalar, i.e. this is a scalar broadcast op.
    if (operandType.getRank() == 0) {
      return {AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()),
              b->getMultiDimIdentityMap(nloops)};
    }

    auto operandShape = operandType.getShape();
    SmallVector<AffineExpr, 4> dimExprs;
    dimExprs.reserve(nloops);

    for (const auto& broadcastDim : llvm::enumerate(broadcastDims)) {
      int size = broadcastDim.value();
      bool expansion_needed =
          operandShape[broadcastDim.index()] == 1 && resultShape[size] != 1;
      if (expansion_needed) {
        op.emitOpError(
            "BroadcastInDimOp lowering to Linalg does not support size-1 "
            "dimensions expansion.");
      }
      dimExprs.push_back(b->getAffineDimExpr(size));
    }
    return {
        AffineMap::get(nloops, /*symbolCount=*/0, dimExprs, b->getContext()),
        b->getMultiDimIdentityMap(nloops)};
  }
};

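// For illustration (a hypothetical input to the pattern above): broadcasting
// memref<1x4xf32> into memref<5x4xf32> with `broadcast_dimensions = [0, 1]`
// requires expanding the size-1 dimension 0. InsertReshapeIfNecessary first
// collapses the operand to memref<4xf32> (reassociation [[0, 1]]), so the
// subsequent `linalg.generic` only maps dimension 1 and never indexes a
// size-1 dimension that needs expansion.
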
template <typename OpTy, bool isLHLO = true>
class TransposeConverter
    : public DataMovementOpConverter<TransposeConverter<OpTy, isLHLO>, OpTy,
                                     isLHLO> {
 public:
  using DataMovementOpConverter<TransposeConverter<OpTy, isLHLO>, OpTy,
                                isLHLO>::DataMovementOpConverter;
  static SmallVector<AffineMap, 2> getIndexingMaps(OpTy op, Builder* b) {
    auto resultType =
        getXLAOpResultType<isLHLO>(op).template cast<ShapedType>();
    auto nloops = resultType.getRank();
    SmallVector<AffineExpr, 2> inputExprs;
    inputExprs.resize(resultType.getRank());
    for (auto permutation : llvm::enumerate(op.permutation())) {
      inputExprs[permutation.value().getZExtValue()] =
          b->getAffineDimExpr(permutation.index());
    }
    return {
        AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()),
        b->getMultiDimIdentityMap(nloops)};
  }
};

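// For illustration (a hypothetical instance of the pattern above): a
// transpose with `permutation = dense<[1, 0]>` yields the indexing maps
//   input:  (d0, d1) -> (d1, d0)
//   output: (d0, d1) -> (d0, d1)
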
// Converts reshape ops that can be proven to be either a collapse of
// dimensions or an expansion of dimensions of the operand.
template <typename OpTy, bool isLHLO = true>
class ReshapeOpConverter : public OpConversionPattern<OpTy> {
 public:
  using OpConversionPattern<OpTy>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      OpTy reshapeOp, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    if (!verifyXLAOpBufferOrTensorSemantics<isLHLO>(reshapeOp))
      return failure();
    ShapedType operandType =
        reshapeOp.operand().getType().template cast<ShapedType>();
    ShapedType resultType = getXLAOpResultType<isLHLO>(reshapeOp);

    if (!operandType.hasStaticShape() || !resultType.hasStaticShape())
      return failure();

    // Compute the reassociation maps for the linalg operation.
    ArrayRef<int64_t> srcShape =
        (operandType.getRank() > resultType.getRank() ? operandType.getShape()
                                                      : resultType.getShape());
    ArrayRef<int64_t> dstShape =
        (operandType.getRank() > resultType.getRank() ? resultType.getShape()
                                                      : operandType.getShape());
    unsigned currSrcDim = 0, currDstDim = 0;
    SmallVector<linalg::ReassociationExprs, 4> reassociationMap(
        dstShape.size());
    while (currSrcDim < srcShape.size() && currDstDim < dstShape.size()) {
      int64_t dstSize = dstShape[currDstDim];
      int64_t srcSize = srcShape[currSrcDim];
      while (srcSize < dstSize && currSrcDim < srcShape.size()) {
        reassociationMap[currDstDim].push_back(
            rewriter.getAffineDimExpr(currSrcDim++));
        srcSize *= srcShape[currSrcDim];
      }
      if (srcSize == dstSize) {
        reassociationMap[currDstDim].push_back(
            rewriter.getAffineDimExpr(currSrcDim++));
        // If the next dim in dstShape is not 1, treat subsequent size-1 dims
        // in srcShape as collapsed into the current dst dim.
        if (currDstDim == dstShape.size() - 1 ||
            dstShape[currDstDim + 1] != 1) {
          while (currSrcDim < srcShape.size() && srcShape[currSrcDim] == 1) {
            reassociationMap[currDstDim].push_back(
                rewriter.getAffineDimExpr(currSrcDim++));
          }
        }
      } else {
        return failure();
      }
      currDstDim++;
    }
    if (currSrcDim != srcShape.size()) return failure();

    if (isLHLO) {
      Value reshapeBuffer = rewriter.create<linalg::ReshapeOp>(
          reshapeOp.getLoc(), resultType, args[0], reassociationMap);
      rewriter.replaceOpWithNewOp<linalg::CopyOp>(
          reshapeOp, reshapeBuffer, args[1], /*inputPermutation=*/nullptr,
          /*outputPermutation=*/nullptr);
    } else {
      rewriter.replaceOpWithNewOp<linalg::TensorReshapeOp>(
          reshapeOp, resultType, args[0], reassociationMap);
    }
    return success();
  }
};

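// For illustration (a hypothetical instance of the pattern above): a static
// reshape from tensor<2x3x4xf32> to tensor<6x4xf32> produces the
// reassociation map [[d0, d1], [d2]], i.e. the first two source dimensions
// (2 * 3 = 6) collapse into the first result dimension.
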
class IotaConverter : public OpConversionPattern<xla_lhlo::IotaOp> {
 public:
  using OpConversionPattern<xla_lhlo::IotaOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::IotaOp iotaOp, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto resultMemrefType =
        iotaOp.getOperand().getType().dyn_cast<MemRefType>();
    if (!resultMemrefType) return failure();

    auto resultElementType = resultMemrefType.getElementType();
    if (!resultElementType.isSignlessIntOrFloat()) return failure();

    // Construct the indexing maps needed for linalg.generic ops.
    unsigned nloops = resultMemrefType.getRank();

    rewriter.create<linalg::IndexedGenericOp>(
        iotaOp.getLoc(), ArrayRef<Type>{}, args,
        0,  // args_in
        1,  // args_out
        llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)),
        GetNParallelLoopsAttrs(nloops),
        [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange ivs,
            ValueRange args) {
          Value castOp = nestedBuilder.create<IndexCastOp>(
              nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()],
              nestedBuilder.getIntegerType(
                  resultElementType.getIntOrFloatBitWidth()));
          if (resultElementType.isa<FloatType>()) {
            castOp = nestedBuilder.create<SIToFPOp>(nestedLoc, castOp,
                                                    resultElementType);
          }
          nestedBuilder.create<linalg::YieldOp>(nestedLoc, castOp);
        });

    rewriter.replaceOp(iotaOp, llvm::None);
    return success();
  }
};

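// For illustration (a hypothetical instance of the pattern above),
//   "xla_lhlo.iota"(%out) {iota_dimension = 1} : (memref<3x4xi32>) -> ()
// becomes a `linalg.indexed_generic` over the 3x4 iteration space whose body
// index-casts the dimension-1 induction variable to i32 and yields it.
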
class ConstConverter : public OpConversionPattern<xla_lhlo::ConstOp> {
 public:
  using OpConversionPattern<xla_lhlo::ConstOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::ConstOp constOp, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = constOp.getLoc();
    auto valueAttr = constOp.value().cast<DenseElementsAttr>();
    if (valueAttr.getType().getRank() != 0) return failure();
    auto stdConstOp =
        rewriter.create<mlir::ConstantOp>(loc, valueAttr.getValue({}));
    rewriter.create<mlir::StoreOp>(loc, stdConstOp, constOp.getOperand());
    rewriter.eraseOp(constOp);
    return success();
  }
};

// TODO(b/156787842): Support the lowering for dynamic shapes.
template <typename OpTy, bool isLHLO = true>
class ReverseConverter
    : public DataMovementOpConverter<ReverseConverter<OpTy, isLHLO>, OpTy,
                                     isLHLO> {
 public:
  using DataMovementOpConverter<ReverseConverter<OpTy, isLHLO>, OpTy,
                                isLHLO>::DataMovementOpConverter;
  static SmallVector<AffineMap, 2> getIndexingMaps(OpTy op, Builder* b) {
    auto resultType =
        getXLAOpResultType<isLHLO>(op).template cast<ShapedType>();
    auto nloops = resultType.getRank();
    SmallVector<AffineExpr, 2> inputExprs;
    inputExprs.reserve(nloops);
    for (int i = 0; i < nloops; ++i)
      inputExprs.push_back(b->getAffineDimExpr(i));
    for (auto dim : op.dimensions()) {
      int i = dim.getZExtValue();
      if (resultType.isDynamicDim(i)) return {};
      int n = resultType.getShape()[i];
      inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i];
    }
    return {
        AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()),
        b->getMultiDimIdentityMap(nloops)};
  }
};

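// For illustration (a hypothetical instance of the pattern above): reversing
// `dimensions = [1]` of a static 2x3 shape yields the input indexing map
// (d0, d1) -> (d0, 2 - d1) together with the identity output map.
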
class SliceConverter : public OpConversionPattern<xla_lhlo::SliceOp> {
 public:
  using OpConversionPattern<xla_lhlo::SliceOp>::OpConversionPattern;

  LogicalResult matchAndRewrite(
      xla_lhlo::SliceOp sliceOp, ArrayRef<Value> args,
      ConversionPatternRewriter& rewriter) const final {
    auto loc = sliceOp.getLoc();
    auto argType =
        sliceOp.getOperand(0).getType().template dyn_cast<ShapedType>();
    if (!argType || !argType.hasRank()) {
      emitError(loc, "lhlo to linalg conversion expects known-rank args");
      return failure();
    }

    SmallVector<Value, 3> ranges;
    for (int i = 0, e = argType.getRank(); i < e; ++i) {
      Value start_index = rewriter.create<ConstantIndexOp>(
          loc, sliceOp.start_indices().getValue<int64_t>(i));
      Value limit_index = rewriter.create<ConstantIndexOp>(
          loc, sliceOp.limit_indices().getValue<int64_t>(i));
      Value stride = rewriter.create<ConstantIndexOp>(
          loc, sliceOp.strides().getValue<int64_t>(i));
      ranges.push_back(rewriter.create<linalg::RangeOp>(loc, start_index,
                                                        limit_index, stride));
    }
    auto linalg_slice =
        rewriter.create<linalg::SliceOp>(loc, sliceOp.getOperand(0), ranges);
    rewriter.create<linalg::CopyOp>(loc, linalg_slice, sliceOp.getOperand(1));
    rewriter.eraseOp(sliceOp);
    return success();
  }
};

void populateLHLOToLinalgConversionPattern(MLIRContext* context,
                                           OwningRewritePatternList* patterns) {
  // clang-format off
  patterns->insert<BroadcastConverter<xla_lhlo::BroadcastOp>,
                   ConstConverter,
                   ConvToLinalgConverter,
                   IotaConverter,
                   LhloBroadcastInDimConverter,
                   PointwiseToLinalgConverter<xla_lhlo::AbsOp>,
                   PointwiseToLinalgConverter<xla_lhlo::AddOp>,
                   PointwiseToLinalgConverter<xla_lhlo::AndOp>,
                   PointwiseToLinalgConverter<xla_lhlo::CeilOp>,
                   PointwiseToLinalgConverter<xla_lhlo::CompareOp>,
                   PointwiseToLinalgConverter<xla_lhlo::ComplexOp>,
                   PointwiseToLinalgConverter<xla_lhlo::ConvertOp>,
                   // TODO(ataei): Remove this pattern, CopyOp is folded away.
                   PointwiseToLinalgConverter<xla_lhlo::CopyOp>,
                   PointwiseToLinalgConverter<xla_lhlo::CosOp>,
                   PointwiseToLinalgConverter<xla_lhlo::DivOp>,
                   PointwiseToLinalgConverter<xla_lhlo::ExpOp>,
                   PointwiseToLinalgConverter<xla_lhlo::ImagOp>,
                   PointwiseToLinalgConverter<xla_lhlo::LogOp>,
                   PointwiseToLinalgConverter<xla_lhlo::MaxOp>,
                   PointwiseToLinalgConverter<xla_lhlo::MinOp>,
                   PointwiseToLinalgConverter<xla_lhlo::MulOp>,
                   PointwiseToLinalgConverter<xla_lhlo::NegOp>,
                   PointwiseToLinalgConverter<xla_lhlo::RealOp>,
                   PointwiseToLinalgConverter<xla_lhlo::RemOp>,
                   PointwiseToLinalgConverter<xla_lhlo::RsqrtOp>,
                   PointwiseToLinalgConverter<xla_lhlo::SelectOp>,
                   PointwiseToLinalgConverter<xla_lhlo::SignOp>,
                   PointwiseToLinalgConverter<xla_lhlo::SinOp>,
                   PointwiseToLinalgConverter<xla_lhlo::SqrtOp>,
                   PointwiseToLinalgConverter<xla_lhlo::SubOp>,
                   PointwiseToLinalgConverter<xla_lhlo::TanhOp>,
                   ReshapeOpConverter<xla_lhlo::ReshapeOp>,
                   ReverseConverter<xla_lhlo::ReverseOp>,
                   ScalarPointwiseToStandardConverter<xla_lhlo::AddOp>,
                   SliceConverter
                  >(context);
  // clang-format on
}

// Converts LHLO ops to Linalg generic.
// Sample result for xla_lhlo::AddOp.
//
// "xla_lhlo.add"(%arg1, %arg2, %out) :
//     (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
//
// will be converted to
//
// #map0 = (d0, d1) -> (d0, d1)
// "linalg.generic"(%arg1, %arg2, %out) ( {
//   ^bb0(%arg4: f32, %arg5: f32):
//     %0 = addf %arg4, %arg5 : f32
//     "linalg.yield"(%0) : (f32) -> ()
//   }) {
//     args_in = 2,
//     args_out = 1,
//     indexing_maps = [#map0, #map0, #map0],
//     iterator_types = ["parallel", "parallel"],
//   } : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
struct LhloLegalizeToLinalg
    : public PassWrapper<LhloLegalizeToLinalg, FunctionPass> {
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    ConversionTarget target(getContext());
    target.addLegalDialect<linalg::LinalgDialect, StandardOpsDialect>();

    auto func = getFunction();
    populateLHLOToLinalgConversionPattern(func.getContext(), &patterns);
    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
      signalPassFailure();
    }
  }
};

struct HloLegalizeToLinalg
    : public PassWrapper<HloLegalizeToLinalg, FunctionPass> {
  void runOnFunction() override {
    OwningRewritePatternList patterns;
    ConversionTarget target(getContext());
    target.addLegalDialect<linalg::LinalgDialect, StandardOpsDialect>();

    auto func = getFunction();
    xla_hlo::populateHLOToLinalgConversionPattern(func.getContext(),
                                                  &patterns);
    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
      signalPassFailure();
    }
  }
};

} // namespace

namespace xla_lhlo {
std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToLinalgPass() {
  return absl::make_unique<LhloLegalizeToLinalg>();
}

static PassRegistration<LhloLegalizeToLinalg> legalize_lhlo_pass(
    "lhlo-legalize-to-linalg", "Legalize from LHLO dialect to Linalg dialect");
} // namespace xla_lhlo

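// The registration above exposes the pass by flag in any mlir-opt-style tool
// that links this library; a hypothetical invocation:
//   some-opt -lhlo-legalize-to-linalg input.mlir
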
namespace xla_hlo {

void populateHLOToLinalgConversionPattern(MLIRContext* context,
                                          OwningRewritePatternList* patterns) {
  patterns->insert<BroadcastConverter<xla_hlo::BroadcastOp, false>,
                   HloBroadcastInDimConverter,
                   PointwiseToLinalgConverter<xla_hlo::AbsOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::AddOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::AndOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::CeilOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::CompareOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::ComplexOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::ConvertOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::CopyOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::CosOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::DivOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::ExpOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::ImagOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::LogOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::MaxOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::MinOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::MulOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::NegOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::RealOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::RemOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::RsqrtOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::SelectOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::SinOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::SqrtOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::SubOp, false>,
                   PointwiseToLinalgConverter<xla_hlo::TanhOp, false>,
                   ReshapeOpConverter<xla_hlo::ReshapeOp, false>,
                   ReverseConverter<xla_hlo::ReverseOp, false>,
                   TransposeConverter<xla_hlo::TransposeOp, false>>(context);
}

std::unique_ptr<OperationPass<FuncOp>> createLegalizeHloToLinalgPass() {
  return absl::make_unique<HloLegalizeToLinalg>();
}

static PassRegistration<HloLegalizeToLinalg> legalize_hlo_pass(
    "hlo-legalize-to-linalg", "Legalize from HLO dialect to Linalg dialect");
} // namespace xla_hlo
} // namespace mlir

@ -0,0 +1,188 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/absl/memory/memory.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/Shape/IR/Shape.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Function.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/MLIRContext.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/Operation.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/PatternMatch.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/IR/StandardTypes.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Pass/Pass.h"
#include "third_party/llvm/llvm-project/mlir/include/mlir/Transforms/DialectConversion.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"

namespace mlir {
namespace xla_hlo {
namespace {

// TODO(frgossen): Make it variadic.
template <typename OpTy>
inline void AddLegalOpOnRankedTensor(ConversionTarget *target) {
  target->addDynamicallyLegalOp<OpTy>([](OpTy op) {
    return llvm::all_of((op.getOperation())->getOperandTypes(),
                        [&](Type t) { return t.isa<RankedTensorType>(); });
  });
}

/// Unary element-wise operations on unranked tensors can be applied to the
/// flattened tensor with the same effect.
/// This pattern rewrites every such operation to
///   (i)   flatten the input tensor,
///   (ii)  apply the unary operation, and
///   (iii) restore the original shape.
template <typename OpTy>
struct UnaryElementwiseOpConversion : public OpRewritePattern<OpTy> {
  explicit UnaryElementwiseOpConversion(MLIRContext *context)
      : OpRewritePattern<OpTy>(context) {}

  LogicalResult matchAndRewrite(OpTy op,
                                PatternRewriter &rewriter) const override {
    // Don't apply the conversion to ops with ranked operands.
    Value operand = op.getOperand();
    auto operandTy = operand.getType().dyn_cast<TensorType>();
    if (!operandTy || operandTy.hasRank()) return failure();

    // Generate IR to flatten the operand.
    auto loc = op.getLoc();
    Value shape = rewriter.create<shape::ShapeOfOp>(loc, operand);
    Value numElements = rewriter.create<shape::NumElementsOp>(
        loc, rewriter.getType<shape::SizeType>(), shape);
    Value numElementsAsIndex = rewriter.create<shape::SizeToIndexOp>(
        loc, rewriter.getIndexType(), numElements);
    Value flatShapeAsDimTensor =
        rewriter.create<TensorFromElementsOp>(loc, numElementsAsIndex);
    auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize},
                                              operandTy.getElementType());
    Value flatOperand = rewriter.create<xla_hlo::DynamicReshapeOp>(
        loc, flatTensorTy, operand, flatShapeAsDimTensor);

    // Generate IR for the actual operation.
    Value flatResult = rewriter.create<OpTy>(loc, flatTensorTy, flatOperand);

    // Generate IR to restore the original shape.
    auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize},
                                                rewriter.getIndexType());
    Value shapeAsExtentTensor =
        rewriter.create<shape::ToExtentTensorOp>(loc, extentTensorTy, shape);
    Value result = rewriter.create<xla_hlo::DynamicReshapeOp>(
        loc, operandTy, flatResult, shapeAsExtentTensor);
    rewriter.replaceOp(op, result);

    return success();
  }
};

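// Schematically (a hypothetical before/after for the pattern above, applied
// to xla_hlo.sqrt on an unranked tensor):
//
//   %r = "xla_hlo.sqrt"(%arg) : (tensor<*xf32>) -> tensor<*xf32>
//
// becomes
//
//   %shape = shape.shape_of %arg
//   %n     = shape.num_elements %shape
//   %flat  = "xla_hlo.dynamic_reshape"(%arg, ...) : (...) -> tensor<?xf32>
//   %sqrt  = "xla_hlo.sqrt"(%flat) : (tensor<?xf32>) -> tensor<?xf32>
//   %r     = "xla_hlo.dynamic_reshape"(%sqrt, ...) : (...) -> tensor<*xf32>
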
/// Binary element-wise operations on unranked tensors can be applied to the
/// flattened operand tensors with the same effect.
/// This pattern rewrites every such operation to
///   (i)   flatten the operand tensors,
///   (ii)  apply the binary operation, and
///   (iii) restore the original shape.
template <typename OpTy>
struct BinaryElementwiseOpConversion : public OpRewritePattern<OpTy> {
  explicit BinaryElementwiseOpConversion(MLIRContext *context)
      : OpRewritePattern<OpTy>(context) {}

  LogicalResult matchAndRewrite(OpTy op,
                                PatternRewriter &rewriter) const override {
    // Don't apply the conversion unless both operands are unranked.
    if (op.lhs().getType().template isa<RankedTensorType>() ||
        op.rhs().getType().template isa<RankedTensorType>()) {
      return failure();
    }

    // Flatten the operands.
    Type shapeTy = shape::ShapeType::get(rewriter.getContext());
    auto loc = op.getLoc();
    Value shapeLhs = rewriter.create<shape::ShapeOfOp>(loc, op.lhs());
    Value shapeRhs = rewriter.create<shape::ShapeOfOp>(loc, op.rhs());
    Value shape = rewriter.create<shape::AnyOp>(loc, shapeTy,
                                                ValueRange{shapeLhs, shapeRhs});
    Value numElements = rewriter.create<shape::NumElementsOp>(loc, shape);
    Value numElementsAsIndex =
        rewriter.create<shape::SizeToIndexOp>(loc, numElements);
    Value flatShape =
        rewriter.create<TensorFromElementsOp>(loc, numElementsAsIndex);
    TensorType lhsTy = op.lhs().getType().template cast<TensorType>();
    Type flatLhsTy = RankedTensorType::get({ShapedType::kDynamicSize},
                                           lhsTy.getElementType());
    Value flatLhs =
        rewriter.create<DynamicReshapeOp>(loc, flatLhsTy, op.lhs(), flatShape);
    TensorType rhsTy = op.rhs().getType().template cast<TensorType>();
    Type flatRhsTy = RankedTensorType::get({ShapedType::kDynamicSize},
                                           rhsTy.getElementType());
    Value flatRhs =
        rewriter.create<DynamicReshapeOp>(loc, flatRhsTy, op.rhs(), flatShape);

    // Apply the actual operation to the flattened operands.
    Value flatResult = rewriter.create<OpTy>(loc, flatLhs, flatRhs);

    // Restore the original shape.
    auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize},
                                                rewriter.getIndexType());
    Value shapeAsExtentTensor =
        rewriter.create<shape::ToExtentTensorOp>(loc, extentTensorTy, shape);
    Value result = rewriter.create<DynamicReshapeOp>(
        loc, op.getType(), flatResult, shapeAsExtentTensor);
    rewriter.replaceOp(op, result);

    return success();
  }
};

struct TransformUnrankedHloPass
    : public PassWrapper<TransformUnrankedHloPass, FunctionPass> {
  void runOnFunction() override {
    // Setup conversion target.
    MLIRContext &ctx = getContext();
    ConversionTarget target(ctx);
    target.addLegalDialect<XlaHloDialect, StandardOpsDialect,
                           shape::ShapeDialect>();
    target.addLegalOp<FuncOp>();
    AddLegalOpOnRankedTensor<SqrtOp>(&target);
    AddLegalOpOnRankedTensor<AddOp>(&target);

    // Populate rewrite patterns.
    OwningRewritePatternList patterns;
    PopulateTransformUnrankedHloPatterns(&ctx, &patterns);

    // Apply transformation.
    if (failed(applyFullConversion(getFunction(), target, patterns)))
      return signalPassFailure();
  }
};

} // namespace

void PopulateTransformUnrankedHloPatterns(MLIRContext *context,
                                          OwningRewritePatternList *patterns) {
  // TODO(frgossen): Populate all unary and binary operations.
  // clang-format off
  patterns->insert<
      BinaryElementwiseOpConversion<AddOp>,
      UnaryElementwiseOpConversion<SqrtOp>>(context);
  // clang-format on
}

static PassRegistration<TransformUnrankedHloPass> transform_unranked_hlo_pass(
    "transform-unranked-hlo",
    "Realize element-wise operations on ranked tensors where possible");

} // namespace xla_hlo
} // namespace mlir

@ -0,0 +1,340 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h"

#include <algorithm>

#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/DenseSet.h"

namespace mlir {

namespace {

using NodeSet = llvm::DenseSet<int32_t>;
using OrderedNodeSet = OrderedSet<int32_t>;

template <typename T>
struct VecStruct {
  typedef llvm::SmallVector<T, 4> type;
};
template <typename T>
using Vec = typename VecStruct<T>::type;

struct Node {
  // Rank number assigned by the Pearce-Kelly algorithm.
  int32_t rank;
  // Temporary marker used by depth-first search.
  bool visited;
  // User-supplied data.
  void* data;
  // List of immediate predecessor nodes in the graph.
  OrderedNodeSet in;
  // List of immediate successor nodes in the graph.
  OrderedNodeSet out;
};

} // namespace

struct GraphCycles::Rep {
  Vec<Node*> nodes;
  // Indices for unused entries in `nodes`.
  Vec<int32_t> free_nodes;

  // Temporary state.
  // Results of the forward DFS.
  Vec<int32_t> deltaf;
  // Results of the backward DFS.
  Vec<int32_t> deltab;
  // All nodes to reprocess.
  Vec<int32_t> list;
  // Rank values to assign to list entries.
  Vec<int32_t> merged;
  // Emulates the recursion stack when doing depth-first search.
  Vec<int32_t> stack;
};

GraphCycles::GraphCycles(int32_t num_nodes) : rep_(new Rep) {
  rep_->nodes.reserve(num_nodes);
  for (int32_t i = 0; i < num_nodes; ++i) {
    Node* n = new Node;
    n->visited = false;
    n->data = nullptr;
    n->rank = rep_->nodes.size();
    rep_->nodes.push_back(n);
  }
}

GraphCycles::~GraphCycles() {
  for (Vec<Node*>::size_type i = 0, e = rep_->nodes.size(); i < e; ++i) {
    delete rep_->nodes[i];
  }
  delete rep_;
}

bool GraphCycles::HasEdge(int32_t x, int32_t y) const {
  return rep_->nodes[x]->out.Contains(y);
}

void GraphCycles::RemoveEdge(int32_t x, int32_t y) {
  rep_->nodes[x]->out.Erase(y);
  rep_->nodes[y]->in.Erase(x);
  // No need to update the rank assignment since a previous valid
  // rank assignment remains valid after an edge deletion.
}

static bool ForwardDFS(GraphCycles::Rep* r, int32_t n, int32_t upper_bound);
static void BackwardDFS(GraphCycles::Rep* r, int32_t n, int32_t lower_bound);
static void Reorder(GraphCycles::Rep* r);
static void Sort(const Vec<Node*>&, Vec<int32_t>* delta);
static void MoveToList(GraphCycles::Rep* r, Vec<int32_t>* src,
                       Vec<int32_t>* dst);
static void ClearVisitedBits(GraphCycles::Rep* r, const Vec<int32_t>& nodes);

bool GraphCycles::InsertEdge(int32_t x, int32_t y) {
  if (x == y) return false;
  Rep* r = rep_;
  Node* nx = r->nodes[x];
  if (!nx->out.Insert(y)) {
    // Edge already exists.
    return true;
  }

  Node* ny = r->nodes[y];
  ny->in.Insert(x);

  if (nx->rank <= ny->rank) {
    // The new edge is consistent with the existing rank assignment.
    return true;
  }

  // The current rank assignments are incompatible with the new edge.
  // Recompute. We only need to consider nodes that fall in the range
  // [ny->rank, nx->rank].
  if (ForwardDFS(r, y, nx->rank)) {
    // Found a cycle. Undo the insertion and tell the caller.
    nx->out.Erase(y);
    ny->in.Erase(x);
    // Since we do not call Reorder() on this path, clear any visited
    // markers left by ForwardDFS.
    ClearVisitedBits(r, r->deltaf);
    return false;
  }
  BackwardDFS(r, x, ny->rank);
  Reorder(r);
  return true;
}

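// For illustration (a hypothetical run of InsertEdge above): with ranks
// rank(a) = 0, rank(b) = 1, rank(c) = 2 and existing edges a->b and b->c,
// inserting c->a calls ForwardDFS(a, /*upper_bound=*/2), which reaches c and
// reports a cycle, so the insertion is undone and false is returned. When no
// cycle is found, BackwardDFS and Reorder reassign only the ranks in the
// affected window [rank(y), rank(x)].
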
// Follows the edges from producer to consumer and searches whether node `n`
// can reach a node with rank `upper_bound` using a DFS. The search only
// follows paths in which every node has a rank smaller than `upper_bound`.
//
// Returns true if such a path exists.
static bool ForwardDFS(GraphCycles::Rep* r, int32_t n, int32_t upper_bound) {
  // Avoid recursion since the stack space might be limited.
  // We instead keep a stack of nodes to visit.
  r->deltaf.clear();
  r->stack.clear();
  r->stack.push_back(n);
  while (!r->stack.empty()) {
    n = r->stack.back();
    r->stack.pop_back();
    Node* nn = r->nodes[n];
    if (nn->visited) continue;

    nn->visited = true;
    r->deltaf.push_back(n);

    for (auto w : nn->out.GetSequence()) {
      Node* nw = r->nodes[w];
      if (nw->rank == upper_bound) {
        return true;
      }
      if (!nw->visited && nw->rank < upper_bound) {
        r->stack.push_back(w);
      }
    }
  }
  return false;
}

// Follows the edges from consumer to producer and visits all the nodes that
// are reachable from node `n` along reversed edges and have a rank larger
// than `lower_bound`.
static void BackwardDFS(GraphCycles::Rep* r, int32_t n, int32_t lower_bound) {
  r->deltab.clear();
  r->stack.clear();
  r->stack.push_back(n);
  while (!r->stack.empty()) {
    n = r->stack.back();
    r->stack.pop_back();
    Node* nn = r->nodes[n];
    if (nn->visited) continue;

    nn->visited = true;
    r->deltab.push_back(n);

    for (auto w : nn->in.GetSequence()) {
      Node* nw = r->nodes[w];
      if (!nw->visited && lower_bound < nw->rank) {
        r->stack.push_back(w);
      }
    }
  }
}

// Recomputes the rank assignments to make them compatible with the edges
// (a producer has a smaller rank than its consumer).
static void Reorder(GraphCycles::Rep* r) {
  Sort(r->nodes, &r->deltab);
  Sort(r->nodes, &r->deltaf);

  // Adds contents of the delta lists to `list` (backward deltas first).
  r->list.clear();
  MoveToList(r, &r->deltab, &r->list);
  MoveToList(r, &r->deltaf, &r->list);

  // Produce a sorted list of all ranks that will be reassigned.
  r->merged.resize(r->deltab.size() + r->deltaf.size());
  std::merge(r->deltab.begin(), r->deltab.end(), r->deltaf.begin(),
             r->deltaf.end(), r->merged.begin());

  // Assign the ranks in order to the collected list.
  for (Vec<int32_t>::size_type i = 0, e = r->list.size(); i < e; ++i) {
    r->nodes[r->list[i]]->rank = r->merged[i];
  }
}

// Sorts the nodes in the vector according to their ranks, smallest rank
// first.
static void Sort(const Vec<Node*>& nodes, Vec<int32_t>* delta) {
  struct ByRank {
    const Vec<Node*>* nodes;
    bool operator()(int32_t a, int32_t b) const {
      return (*nodes)[a]->rank < (*nodes)[b]->rank;
    }
  };
  ByRank cmp;
  cmp.nodes = &nodes;
  std::sort(delta->begin(), delta->end(), cmp);
}

// Moves the nodes in vector `src` to vector `dst`, replacing each `src`
// entry with the corresponding node's rank.
static void MoveToList(GraphCycles::Rep* r, Vec<int32_t>* src,
                       Vec<int32_t>* dst) {
  for (Vec<int32_t>::size_type i = 0, e = src->size(); i < e; i++) {
    int32_t w = (*src)[i];
    // Replace the src entry with its rank.
    (*src)[i] = r->nodes[w]->rank;
    // Prepare for future DFS calls.
    r->nodes[w]->visited = false;
    dst->push_back(w);
  }
}

// Clears the bookkeeping fields used during the last DFS.
static void ClearVisitedBits(GraphCycles::Rep* r, const Vec<int32_t>& nodes) {
  for (Vec<int32_t>::size_type i = 0, e = nodes.size(); i < e; i++) {
    r->nodes[nodes[i]]->visited = false;
  }
}

bool GraphCycles::IsReachable(int32_t x, int32_t y) {
  if (x == y) return true;
  Rep* r = rep_;
  Node* nx = r->nodes[x];
  Node* ny = r->nodes[y];

  if (nx->rank >= ny->rank) {
    // x cannot reach y since it comes after y in the topological ordering.
    return false;
  }

  // See if x can reach y using a DFS that is limited to y's rank.
  bool reachable = ForwardDFS(r, x, ny->rank);

  // Clear any visited markers left by ForwardDFS.
  ClearVisitedBits(r, r->deltaf);
  return reachable;
}

llvm::Optional<int32_t> GraphCycles::ContractEdge(int32_t a, int32_t b) {
  assert(HasEdge(a, b));
  RemoveEdge(a, b);

  if (IsReachable(a, b)) {
    // Restore the graph to its original state.
    InsertEdge(a, b);
    return {};
  }

  if (rep_->nodes[b]->in.Size() + rep_->nodes[b]->out.Size() >
      rep_->nodes[a]->in.Size() + rep_->nodes[a]->out.Size()) {
    // Swap "a" and "b" to minimize copying.
    std::swap(a, b);
  }

  Node* nb = rep_->nodes[b];
  OrderedNodeSet out = std::move(nb->out);
  OrderedNodeSet in = std::move(nb->in);
  for (int32_t y : out.GetSequence()) {
    rep_->nodes[y]->in.Erase(b);
  }
  for (int32_t y : in.GetSequence()) {
    rep_->nodes[y]->out.Erase(b);
  }
  rep_->free_nodes.push_back(b);

  rep_->nodes[a]->out.Reserve(rep_->nodes[a]->out.Size() + out.Size());
  for (int32_t y : out.GetSequence()) {
    InsertEdge(a, y);
  }

  rep_->nodes[a]->in.Reserve(rep_->nodes[a]->in.Size() + in.Size());
  for (int32_t y : in.GetSequence()) {
    InsertEdge(y, a);
  }

  // Note: if the swap above happened, the returned node may be the one
  // originally passed in as "b".
  return a;
}

std::vector<int32_t> GraphCycles::SuccessorsCopy(int32_t node) const {
  return rep_->nodes[node]->out.GetSequence();
}

namespace {
void SortInPostOrder(const Vec<Node*>& nodes, std::vector<int32_t>* to_sort) {
  std::sort(to_sort->begin(), to_sort->end(), [&](int32_t a, int32_t b) {
    return nodes[a]->rank > nodes[b]->rank;
  });
}
} // namespace

std::vector<int32_t> GraphCycles::AllNodesInPostOrder() const {
  llvm::DenseSet<int32_t> free_nodes_set;
  for (int32_t n : rep_->free_nodes) free_nodes_set.insert(n);

  std::vector<int32_t> all_nodes;
  all_nodes.reserve(rep_->nodes.size() - free_nodes_set.size());
  for (size_t i = 0, e = rep_->nodes.size(); i < e; i++) {
    if (!free_nodes_set.count(i)) {
      all_nodes.push_back(i);
    }
  }

  SortInPostOrder(rep_->nodes, &all_nodes);
  return all_nodes;
}

} // namespace mlir

@ -0,0 +1,89 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "third_party/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/cycle_detector.h"

#include "third_party/tensorflow/compiler/xla/test.h"

class GraphCyclesTest : public ::testing::Test {
 public:
  GraphCyclesTest() : g_(100) {}

  bool AddEdge(int x, int y) { return g_.InsertEdge(x, y); }

  void AddMultiples() {
    // For every node x > 0: add an edge to 2*x and 3*x.
    for (int x = 1; x < 25; x++) {
      EXPECT_TRUE(AddEdge(x, 2 * x)) << x;
      EXPECT_TRUE(AddEdge(x, 3 * x)) << x;
    }
  }

  mlir::GraphCycles g_;
};

TEST_F(GraphCyclesTest, NoCycle) { AddMultiples(); }

TEST_F(GraphCyclesTest, SimpleCycle) {
  AddMultiples();
  EXPECT_FALSE(AddEdge(8, 4));
}

TEST_F(GraphCyclesTest, IndirectCycle) {
  AddMultiples();
  EXPECT_TRUE(AddEdge(16, 9));
  EXPECT_FALSE(AddEdge(9, 2));
}

TEST_F(GraphCyclesTest, RemoveEdge) {
  EXPECT_TRUE(AddEdge(1, 2));
  EXPECT_TRUE(AddEdge(2, 3));
  EXPECT_TRUE(AddEdge(3, 4));
  EXPECT_TRUE(AddEdge(4, 5));
  g_.RemoveEdge(2, 3);
  EXPECT_FALSE(g_.HasEdge(2, 3));
}

TEST_F(GraphCyclesTest, IsReachable) {
  EXPECT_TRUE(AddEdge(1, 2));
  EXPECT_TRUE(AddEdge(2, 3));
  EXPECT_TRUE(AddEdge(3, 4));
  EXPECT_TRUE(AddEdge(4, 5));

  EXPECT_TRUE(g_.IsReachable(1, 5));
  EXPECT_FALSE(g_.IsReachable(5, 1));
}

TEST_F(GraphCyclesTest, ContractEdge) {
  ASSERT_TRUE(AddEdge(1, 2));
  ASSERT_TRUE(AddEdge(1, 3));
  ASSERT_TRUE(AddEdge(2, 3));
  ASSERT_TRUE(AddEdge(2, 4));
  ASSERT_TRUE(AddEdge(3, 4));

  // Contracting this edge would introduce a cycle via the path 1 -> 2 -> 3.
  EXPECT_FALSE(g_.ContractEdge(1, 3).hasValue());
  EXPECT_TRUE(g_.HasEdge(1, 3));

  // Node 2 has more edges, so the contraction keeps node 2.
  EXPECT_EQ(*g_.ContractEdge(1, 2), 2);
  EXPECT_TRUE(g_.HasEdge(2, 3));
  EXPECT_TRUE(g_.HasEdge(2, 4));
  EXPECT_TRUE(g_.HasEdge(3, 4));

  // Node 2 has more edges, so the contraction keeps node 2.
  EXPECT_EQ(*g_.ContractEdge(2, 3), 2);
  EXPECT_TRUE(g_.HasEdge(2, 4));
}