PR #50020: [MLIR][DISC] support fusion on buffer
Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/50020

This pass implements the logic to group kLoop/kInput fusion patterns at the buffer level. The reason for this is that we can avoid a lot of headaches from handling `shape-only` consumers (e.g. memref.dim, shape.shapeOf) specially, since shapes are already resolved in the buffer world. It may be better to move this pass to the tensor level after more shape inference/constraint infrastructure is ready at the mhlo level.

Copybara import of the project:

-- e31f8344b59aa9860097197585215ea1689b8ff4 by Wenyi Zhao <reyizero@gmail.com>:
   [MLIR][DISC] support fusion on buffer (same description as above)
-- 35f2eb2791241b0ab5db1ddcaf1b4006278ddccf by Wenyi Zhao <reyizero@gmail.com>:
   fix
-- 923c8d61f7fe00a2a0df22d5be396508f0667964 by Wenyi Zhao <reyizero@gmail.com>:
   fix sanity check failure

PiperOrigin-RevId: 379743424
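For context, the sketch below shows one way the new pass could be run over a bufferized module; it is illustrative only and not part of the PR. `createLhloFusionPass` and its `max_num_arguments_per_kernel` parameter come from this change, while the header path and the surrounding pass-manager scaffolding are assumptions.

// Minimal sketch (assumed scaffolding, not part of this PR): run buffer-level
// fusion on every function in a module. The pass groups fusible lmhlo ops into
// kLoop/kInput fusion patterns as described in the files below.
#include "mlir-hlo/Dialect/mhlo/transforms/passes.h"  // assumed location of createLhloFusionPass
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"

mlir::LogicalResult runBufferFusion(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  // The pass is a FunctionPass, so nest it on FuncOp; 64 is the default bound
  // on the number of buffers (inputs + outputs) a fused kernel may take.
  pm.addNestedPass<mlir::FuncOp>(
      mlir::lmhlo::createLhloFusionPass(/*max_num_arguments_per_kernel=*/64));
  return pm.run(module);
}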
This commit is contained in:
  parent 82696f8598
  commit 34dc5f2a79

  BUILD | 38
@@ -1213,6 +1213,42 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "fusion_utils",
+    srcs = ["lib/Dialect/mhlo/transforms/fusion_utils.cc"],
+    hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/fusion_utils.h"],
+    deps = [
+        ":lhlo",
+        "@llvm-project//llvm:Core",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Shape",
+        "@llvm-project//mlir:StandardOps",
+        "@llvm-project//mlir:Support",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "lhlo_fusion",
+    srcs = ["lib/Dialect/mhlo/transforms/lhlo_fusion.cc"],
+    deps = [
+        ":cycle_detector",
+        ":fusion_utils",
+        ":lhlo",
+        ":pass_details",
+        "@llvm-project//llvm:Core",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Shape",
+        "@llvm-project//mlir:StandardOps",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "chlo_legalize_to_hlo",
     srcs = ["lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc"],
@@ -1259,6 +1295,7 @@ cc_library(
     ],
     deps = [
         ":DiscRalPassIncGen",
+        ":LmhloPassIncGen",
         ":MhloPassIncGen",
         "@llvm-project//mlir:Pass",
     ],
@@ -1316,6 +1353,7 @@ cc_library(
         ":legalize_trigonometric_to_approximation",
         ":lhlo",
         ":lhlo_fuse_linalg",
+        ":lhlo_fusion",
         ":lhlo_legalize_to_affine",
         ":lhlo_legalize_to_gpu",
        ":lhlo_legalize_to_parallel_loops",
@@ -25,6 +25,14 @@ namespace mhlo {
 #include "mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.h.inc"
 
 }  // end namespace mhlo
+
+namespace lmhlo {
+
+#define GEN_PASS_CLASSES
+#include "mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.h.inc"
+
+}  // end namespace lmhlo
+
 }  // end namespace mlir
 
 namespace mlir {
@@ -0,0 +1,244 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_FUSION_UTILS_H_
#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_FUSION_UTILS_H_

#include <memory>
#include <vector>

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/Debug.h"
#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project

// This file implements some helper functions and classes used to do fusion
// & code generation.

namespace mlir {
namespace lmhlo {

// kLoop fusion template satisfies:
//   - all ops in the fusion pattern are element-wise.
//   - all the shapes of the outputs of the fusion pattern are the same or have
//   the same number of elements, and thus can fit into the same parallel loop.
//
// kInput fusion template satisfies:
//   - any op in the fusion pattern is either element-wise or a reduction.
//   - if an op is a reduction, its output cannot be consumed by other
//     ops in the same fusion pattern.
//   - all the effective shapes of the outputs of the fusion pattern are the same.
//     - For an element-wise op, its effective shape is its output shape.
//     - For a reduction op, its effective shape is its operand shape.
//   - currently our downstream codegen engine only supports 2d -> 1d tensor
//   reduction. TODO: lift this limitation.
//     - 2D row reduction: out[i] = sum({in[i][j] for all j})
//     - 2D column reduction: out[j] = sum({in[i][j] for all i})
enum FusionType {
  // Not a fusion pattern
  kNone,
  // kLoop fusion pattern
  kLoop,
  // kInput fusion pattern and all reduce ops of the fused pattern are row
  // reductions
  kRowReduction,
  // kInput fusion pattern and all reduce ops of the fused pattern are column
  // reductions
  kColReduction,
};

// Returns true if the op is an elementwise unary lmhlo op.
// TODO: use fusibility interface
bool isElementWiseUnary(Operation* op);

// Returns true if the op is an elementwise binary lmhlo op.
// TODO: use fusibility interface
bool isElementWiseBinary(Operation* op);

// Returns true if the op is an elementwise lmhlo op.
// TODO: use fusibility interface
bool isElementWise(Operation* op);

// Returns true if this op is a rank-2 row reduction.
bool isRank2RowReduction(Operation* op);

// Returns true if this op is a rank-2 column reduction.
bool isRank2ColReduction(Operation* op);

// Returns true if the op is supported by the downstream fusion codegen
// engine.
bool isFusible(Operation* op);

// Returns the number of operands that are supposed to be written.
// For some ops (e.g. lmhlo ops), some operands are the output memrefs.
// Thus these operands are supposed to be updated.
int getNumResultOperands(Operation* op);

// Returns data users of the value and its aliases (e.g. memref.cast).
// Here non-data users means DimOp, DeallocOp and ShapeOfOp.
SmallVector<Operation*, 4> getValueUsers(Value v);

// Represents a list of lmhlo ops that are going to be fused.
class FusionPattern {
 public:
  using FusionOpList = SmallVector<Operation*, 4>;
  using FusionValueList = SmallVector<Value, 4>;

  // Create a new fusion pattern from a single op.
  FusionPattern(Operation* op);

  // Create a new fusion pattern from the ops inside the lmhlo fusion op.
  FusionPattern(lmhlo::FusionOp op);

  // Returns the op list this fusion pattern represents.
  FusionOpList& getOpList() { return op_list_; }

  // Returns the dominant op of this fusion pattern.
  // For kLoop fusion, a dominant op may be any op that has external users.
  // For kInput fusion, a dominant op may be a row reduction (if one exists), or
  // a column reduction op.
  Operation* getDominantOp() { return dominant_op_; }

  // Sets the dominant op to the op provided.
  void setDominantOp(Operation* op) { dominant_op_ = op; }

  // Returns the fusion kind of the fusion pattern.
  FusionType getFusionType() { return fusion_type_; }

  // Sets the fusion type to the type provided.
  void setFusionType(FusionType type) { fusion_type_ = type; }

  // Returns true if this is a fusible fusion pattern.
  bool isFusible() { return getFusionType() != FusionType::kNone; }

  // Returns true if this fusion pattern is a kLoop fusion.
  bool isKLoopFusion() { return getFusionType() == FusionType::kLoop; }

  // Returns true if this fusion pattern is a kInput fusion.
  bool isKInputFusion() {
    return (getFusionType() == FusionType::kRowReduction ||
            getFusionType() == FusionType::kColReduction);
  }

  // Returns true if two fusion patterns can be merged into one bigger fusion
  // pattern.
  bool isMergeable(FusionPattern& other);

  // Merges two fusion patterns and returns the merged pattern. The original
  // pattern remains unmodified.
  FusionPattern merge(FusionPattern& other);

  // Merges two fusion patterns and returns the merged pattern. Replaces the
  // original pattern with the new merged pattern.
  FusionPattern& mergeInplace(FusionPattern& other);

  // Returns values that are consumed by the lmhlo ops inside the fusion
  // pattern.
  FusionValueList& getOperands() { return operands_; }

  // Returns values that are outputs of any lmhlo op in the fused pattern and
  // have consumers outside the fusion pattern.
  FusionValueList& getResults() { return results_; }

  // Returns values that are outputs of any lmhlo op in the fused pattern and
  // are only consumed by the lmhlo ops inside the fused pattern.
  FusionValueList& getInternalResults() { return internal_results_; }

  // Returns the number of ops this fusion pattern contains.
  int size() { return op_list_.size(); }

  // Returns the effective size (i.e. not counting const ops) of the ops this
  // fusion pattern contains.
  int effectiveSize();

  // Sorts the ops inside the fusion pattern according to the keys provided.
  void sortFusionOpListBy(DenseMap<Operation*, int>& op_to_idx);

 private:
  FusionPattern(SmallVectorImpl<Operation*>& op_list);

 private:
  // Calculates the inputs and outputs of the fusion pattern.
  void calculateOperandsAndResults();

 private:
  FusionOpList op_list_;
  Operation* dominant_op_ = nullptr;
  FusionType fusion_type_ = FusionType::kNone;
  FusionValueList operands_;
  FusionValueList results_;
  FusionValueList internal_results_;
};

// Represents a list of disjoint fusion patterns for a block.
using FusionPlan = std::vector<FusionPattern>;

using llvm::EquivalenceClasses;

// Supports using EquivalenceClasses for Value
class ValueWrapper {
 public:
  explicit ValueWrapper(Value value) : value_(std::move(value)) {}

  Value getValue() const { return value_; }

  bool operator==(const ValueWrapper& rhs) const {
    return getValue() == rhs.getValue();
  }

 private:
  Value value_;
};

bool operator<(const ValueWrapper& lhs, const ValueWrapper& rhs);

// This is a simple shape constraint analysis, which is used to
// guide fusion decisions (e.g. we only fuse shape-compatible ops).
//
// Currently, we only consider shape equality and same-number-elements equality
// propagation based on the shape constraint traits of elementwise ops (assuming
// that implicit shape broadcast is forbidden).
class ShapeConstraintAnalysis {
 public:
  explicit ShapeConstraintAnalysis(const SmallVectorImpl<Operation*>& op_list) {
    PropagateEquality(op_list);
  }

  // Returns true if `lhs` and `rhs` are supposed to have the same shape.
  bool HasSameShape(Value lhs, Value rhs) {
    return same_shape_impl_.isEquivalent(ValueWrapper(lhs), ValueWrapper(rhs));
  }

  // Returns true if `lhs` and `rhs` are supposed to have the same number of
  // elements.
  bool HasSameNumElements(Value lhs, Value rhs) {
    return same_num_elements_impl_.isEquivalent(ValueWrapper(lhs),
                                                ValueWrapper(rhs));
  }

 private:
  // Shape equality propagation based on the shape constraints of
  // elementwise ops.
  void PropagateEquality(const SmallVectorImpl<Operation*>& op_list);

  // a UnionFind set
  EquivalenceClasses<ValueWrapper> same_shape_impl_;
  EquivalenceClasses<ValueWrapper> same_num_elements_impl_;
};

}  // namespace lmhlo
}  // namespace mlir

#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_FUSION_UTILS_H_
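A minimal sketch of how the utilities declared above are intended to be used; this snippet is illustrative only and is not part of the diff. It assumes `op` is an lmhlo op nested in a function that has already been bufferized.

// Illustrative sketch (not in this PR): classify a single lmhlo op and count
// the buffers a one-op fusion pattern would read and expose to the outside.
void classifySingleOp(mlir::Operation* op) {
  mlir::lmhlo::FusionPattern pattern(op);
  if (!pattern.isFusible()) return;
  // kLoop for elementwise ops, kRowReduction/kColReduction for rank-2 reductions.
  mlir::lmhlo::FusionType type = pattern.getFusionType();
  // Buffers the pattern reads vs. buffers it writes that are visible outside it.
  unsigned num_inputs = pattern.getOperands().size();
  unsigned num_outputs = pattern.getResults().size();
  (void)type; (void)num_inputs; (void)num_outputs;
}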
@@ -56,3 +56,12 @@ def LegalizeTensorLoadOpPass : Pass<"lhlo-legalize-tensor-load-op", "FuncOp"> {
   let constructor = "createLegalizeTensorLoadOpPass()";
 }
 
+def LhloFusionPass : FunctionPass<"lhlo-fusion"> {
+  let summary = "Fuse lmhlo ops to kLoop/kInput fusion patterns.";
+  let constructor = "createLhloFusionPass()";
+  let options = [
+    Option<"max_num_arguments_per_kernel_", "max-num-arguments-per-kernel", "int",
+           /*default=*/"64", "Maximum allowed number of arguments per fused kernel.">,
+  ];
+}
+
@@ -117,6 +117,10 @@ std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToParallelLoopsPass();
 // Legalizes tensor load ops that are inserted during mhlo to lmhlo conversion.
 std::unique_ptr<OperationPass<FuncOp>> createLegalizeTensorLoadOpPass();
 
+// fuse lmhlo ops to kLoop/kInput fusion patterns
+std::unique_ptr<OperationPass<FuncOp>> createLhloFusionPass(
+    int max_num_arguments_per_kernel = 64);
+
 }  // namespace lmhlo
 
 namespace disc_ral {
@@ -137,8 +137,10 @@ add_mlir_library(MhloLhloToLinalg
 )
 
 add_mlir_library(LmhloPasses
+  fusion_utils.cc
   legalize_tensor_load_op.cc
   lhlo_fuse_linalg.cc
+  lhlo_fusion.cc
   lhlo_legalize_to_affine.cc
   lhlo_legalize_to_gpu.cc
   lhlo_legalize_to_parallel_loops.cc
@@ -0,0 +1,394 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlir-hlo/Dialect/mhlo/transforms/fusion_utils.h"

#include <algorithm>

#include "mlir/Dialect/Shape/IR/Shape.h"  // TF:llvm-project
#include "mlir/IR/MLIRContext.h"          // TF:llvm-project
#include "mlir/IR/Matchers.h"

// This file implements some helper functions and classes used to do fusion
// & code generation.

namespace mlir {
namespace lmhlo {

// Returns true if the op is an elementwise unary lmhlo op.
// TODO(disc): use fusibility interface
bool isElementWiseUnary(Operation* op) {
  // clang-format off
  return isa<
    lmhlo::AbsOp,
    lmhlo::CeilOp,
    lmhlo::ConvertOp,
    lmhlo::CopyOp,
    lmhlo::CosOp,
    lmhlo::ExpOp,
    lmhlo::FloorOp,
    lmhlo::IsFiniteOp,
    lmhlo::LogOp,
    lmhlo::NegOp,
    lmhlo::NotOp,
    lmhlo::RsqrtOp,
    lmhlo::SignOp,
    lmhlo::SqrtOp,
    lmhlo::TanhOp
  >(op);
  // clang-format on
}

// Returns true if the op is an elementwise binary lmhlo op.
// TODO(disc): use fusibility interface
bool isElementWiseBinary(Operation* op) {
  // clang-format off
  return isa<
    lmhlo::AddOp,
    lmhlo::AndOp,
    lmhlo::CompareOp,
    lmhlo::DivOp,
    lmhlo::MaxOp,
    lmhlo::MinOp,
    lmhlo::MulOp,
    lmhlo::OrOp,
    lmhlo::PowOp,
    lmhlo::SubOp
  >(op);
  // clang-format on
}

// Returns true if the op is an elementwise lmhlo op.
// TODO(disc): use fusibility interface
bool isElementWise(Operation* op) {
  return isElementWiseUnary(op) || isElementWiseBinary(op);
}

// Returns true if this op is a rank-2 row reduction.
bool isRank2RowReduction(Operation* op) {
  auto reduce_op = dyn_cast<lmhlo::ReduceOp>(op);
  if (!reduce_op || reduce_op.dimensions().getNumElements() != 1) return false;

  int rank = op->getOperand(0).getType().cast<MemRefType>().getRank();
  auto dimensions = reduce_op.dimensions().getValues<int64_t>();
  return ((*dimensions.begin() == 1) && (rank == 2));
}

// Returns true if this op is a rank-2 column reduction.
bool isRank2ColReduction(Operation* op) {
  auto reduce_op = dyn_cast<lmhlo::ReduceOp>(op);
  if (!reduce_op || reduce_op.dimensions().getNumElements() != 1) return false;

  int rank = op->getOperand(0).getType().cast<MemRefType>().getRank();
  auto dimensions = reduce_op.dimensions().getValues<int64_t>();
  return ((*dimensions.begin() == 0) && (rank == 2));
}

// Returns true if the op is supported by the downstream fusion codegen
// engine.
bool isFusible(Operation* op) {
  // Only scalar const ops are supported by the fusion codegen engine a.t.m.
  if (dyn_cast<lmhlo::ConstOp>(op)) {
    MemRefType type = op->getOperand(0).getType().cast<MemRefType>();
    return (type.getRank() == 0);
  }

  // All elementwise ops are supported by the fusion codegen engine.
  if (isElementWise(op)) return true;

  // Only rank-2 tensor -> rank-1 tensor reductions are supported now.
  if (isRank2RowReduction(op) || isRank2ColReduction(op)) return true;

  // clang-format off
  return isa<
    lmhlo::BroadcastInDimOp,
    lmhlo::BroadcastOp,
    lmhlo::ConcatenateOp,
    lmhlo::DynamicBroadcastInDimOp,
    lmhlo::DynamicGatherOp,
    lmhlo::DynamicIotaOp,
    lmhlo::DynamicPadOp,
    lmhlo::DynamicReshapeOp,
    lmhlo::GatherOp,
    lmhlo::RealDynamicSliceOp,
    lmhlo::ReshapeOp,
    lmhlo::SelectOp,
    lmhlo::SliceOp,
    lmhlo::TransposeOp
  >(op);
  // clang-format on
}

// Returns the number of operands that are supposed to be written.
// For some ops (e.g. lmhlo ops), some operands are the output memrefs.
// Thus these operands are supposed to be updated.
int getNumResultOperands(Operation* op) {
  if (op->getDialect()->getNamespace() != "lmhlo") {
    return 0;
  }

  auto isWritable = [&](Value operand) -> bool {
    llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 2> effects;
    MemoryEffectOpInterface interface = dyn_cast<MemoryEffectOpInterface>(op);
    // Suppose that operands of ops without `MemoryEffectOpInterface` are
    // read-only.
    if (!interface) return false;

    interface.getEffectsOnValue(operand, effects);
    return llvm::any_of(
        effects, [](const mlir::MemoryEffects::EffectInstance& instance) {
          return mlir::isa<mlir::MemoryEffects::Write>(instance.getEffect());
        });
  };

  return llvm::count_if(op->getOperands(),
                        [&](Value v) { return isWritable(v); });
}

// Returns data users of the value and its aliases (e.g. memref.cast).
// Here non-data users means DimOp, DeallocOp and ShapeOfOp.
SmallVector<Operation*, 4> getValueUsers(Value v) {
  SmallVector<Operation*, 4> users;
  SmallVector<Value, 4> worklist;
  worklist.push_back(v);
  while (!worklist.empty()) {
    Value curr = worklist.back();
    worklist.pop_back();
    for (Operation* user : curr.getUsers()) {
      // Skip non-data users
      if (isa<memref::DimOp, memref::DeallocOp, shape::ShapeOfOp>(user)) {
        continue;
      }
      // alias value
      if (isa<memref::CastOp>(user)) {
        worklist.push_back(user->getResult(0));
      } else {
        users.push_back(user);
      }
    }
  }
  return users;
}

// Create a new fusion pattern from a single op.
FusionPattern::FusionPattern(Operation* op) {
  op_list_.push_back(op);
  if (isRank2RowReduction(op)) {
    fusion_type_ = FusionType::kRowReduction;
  } else if (isRank2ColReduction(op)) {
    fusion_type_ = FusionType::kColReduction;
  } else if (mlir::lmhlo::isFusible(op)) {
    fusion_type_ = FusionType::kLoop;
  } else {
    fusion_type_ = FusionType::kNone;
  }
  dominant_op_ = op;
  calculateOperandsAndResults();
}

// Create a new fusion pattern from the ops inside the lmhlo fusion op.
FusionPattern::FusionPattern(lmhlo::FusionOp op) {
  for (Operation& op : op.region().getBlocks().front()) {
    op_list_.push_back(&op);
  }

  // Figure out the fusion type and dominant op for the fusion pattern.
  for (Operation* op : op_list_) {
    if (isRank2RowReduction(op)) {
      fusion_type_ = FusionType::kRowReduction;
      dominant_op_ = op;
    } else if (isRank2ColReduction(op)) {
      if (fusion_type_ != FusionType::kRowReduction) {
        fusion_type_ = FusionType::kColReduction;
        dominant_op_ = op;
      }
    } else if (lmhlo::isFusible(op)) {
      // Ignore if already a kRowReduction or kColReduction, otherwise update
      // the fusion type to kLoop and dominant op to current op. This supposes
      // that the last op inside the block is a valid candidate dominant op if
      // the fusion pattern is a kLoop.
      if (fusion_type_ == FusionType::kNone ||
          fusion_type_ == FusionType::kLoop) {
        fusion_type_ = FusionType::kLoop;
        dominant_op_ = op;
      }
    } else {
      // Not a supported fusionOp, early stop.
      fusion_type_ = FusionType::kNone;
      dominant_op_ = nullptr;
      break;
    }
  }

  if (isFusible()) calculateOperandsAndResults();
}

// Create a new fusion pattern from a valid fusion op list.
FusionPattern::FusionPattern(SmallVectorImpl<Operation*>& op_list)
    : op_list_(op_list.begin(), op_list.end()) {
  calculateOperandsAndResults();
}

// Returns true if two fusion patterns can be merged into one bigger fusion
// pattern.
bool FusionPattern::isMergeable(FusionPattern& other) {
  if (!this->isFusible() || !other.isFusible()) return false;
  return true;
}

// Merges two fusion patterns and returns the merged pattern. The original
// pattern remains unmodified.
FusionPattern FusionPattern::merge(FusionPattern& other) {
  assert(isMergeable(other));
  FusionOpList new_op_list = op_list_;
  new_op_list.insert(new_op_list.end(), other.getOpList().begin(),
                     other.getOpList().end());
  FusionPattern new_fusion_pattern{new_op_list};

  FusionType newType = FusionType::kLoop;
  Operation* newDominant = getDominantOp();

  // kRowReduction + (kRowReduction | kColReduction | kLoop) = kRowReduction
  // kColReduction + (kColReduction | kLoop) = kColReduction
  // kLoop + kLoop = kLoop
  if (getFusionType() == FusionType::kRowReduction ||
      other.getFusionType() == FusionType::kRowReduction) {
    newType = FusionType::kRowReduction;
    if (getFusionType() != FusionType::kRowReduction)
      newDominant = other.getDominantOp();
  } else if (getFusionType() == FusionType::kColReduction ||
             other.getFusionType() == FusionType::kColReduction) {
    newType = FusionType::kColReduction;
    if (getFusionType() != FusionType::kColReduction)
      newDominant = other.getDominantOp();
  }

  new_fusion_pattern.setDominantOp(newDominant);
  new_fusion_pattern.setFusionType(newType);
  return new_fusion_pattern;
}

// Merges two fusion patterns and returns the merged pattern. Replaces the
// original pattern with the new merged pattern.
FusionPattern& FusionPattern::mergeInplace(FusionPattern& other) {
  *this = merge(other);
  return *this;
}

// Returns the effective size (i.e. not counting const ops) of the ops this
// fusion pattern contains.
int FusionPattern::effectiveSize() {
  return llvm::count_if(
      op_list_, [](Operation* op) { return !matchPattern(op, m_Constant()); });
}

// Sorts the ops inside the fusion pattern according to the keys provided.
void FusionPattern::sortFusionOpListBy(DenseMap<Operation*, int>& op_to_idx) {
  std::sort(op_list_.begin(), op_list_.end(),
            [&](Operation* lhs, Operation* rhs) {
              return op_to_idx[lhs] < op_to_idx[rhs];
            });
}

// Calculates the inputs and outputs of the fusion pattern.
void FusionPattern::calculateOperandsAndResults() {
  DenseSet<Value> input_set;
  DenseSet<Value> result_set;
  DenseSet<Value> internal_result_set;
  DenseSet<Operation*> op_set(op_list_.begin(), op_list_.end());

  DenseMap<Value, Operation*> last_writer;
  for (Operation* op : op_list_) {
    int num_input_operand = op->getNumOperands() - getNumResultOperands(op);
    for (Value v : op->getOperands().drop_front(num_input_operand)) {
      bool inserted = last_writer.try_emplace(v, op).second;
      (void)inserted;
      assert(inserted);

      bool has_external_user = false;
      for (Operation* user : getValueUsers(v)) {
        if (!op_set.contains(user)) {
          has_external_user = true;
          break;
        }
      }

      if (has_external_user) {
        results_.push_back(v);
      } else {
        internal_results_.push_back(v);
      }
    }
  }

  for (Operation* op : op_list_) {
    int num_input_operand = op->getNumOperands() - getNumResultOperands(op);
    for (Value value : op->getOperands().take_front(num_input_operand)) {
      if (last_writer.find(value) != last_writer.end()) {
        // Skip if the defining op is in the pattern.
        continue;
      }
      input_set.insert(value);
    }
  }

  for (Value v : input_set) operands_.push_back(v);
}

// Supports using EquivalenceClasses for Value
bool operator<(const ValueWrapper& lhs, const ValueWrapper& rhs) {
  auto lhs_value = lhs.getValue().getAsOpaquePointer();
  auto rhs_value = rhs.getValue().getAsOpaquePointer();
  return lhs_value < rhs_value;
}

// Shape equality propagation based on the shape constraints of
// elementwise ops.
void ShapeConstraintAnalysis::PropagateEquality(
    const SmallVectorImpl<Operation*>& op_list) {
  bool converged = true;
  do {
    converged = true;
    auto update = [&](Value lhs, Value rhs,
                      EquivalenceClasses<ValueWrapper>& impl) {
      if (!impl.isEquivalent(ValueWrapper(lhs), ValueWrapper(rhs))) {
        converged = false;
        impl.unionSets(ValueWrapper(lhs), ValueWrapper(rhs));
      }
    };
    for (Operation* op : op_list) {
      int num_operand = op->getNumOperands();
      // Propagates same num_elements equality, and shape equality
      if (isElementWise(op)) {
        Value lhs = op->getOperand(0);
        for (Value rhs : op->getOperands().drop_front()) {
          update(lhs, rhs, same_num_elements_impl_);
          update(lhs, rhs, same_shape_impl_);
        }
      }
      // Propagates same num_elements equality, not shape equality
      if (isa<lmhlo::DynamicReshapeOp, lmhlo::ReshapeOp, lmhlo::TransposeOp>(
              op)) {
        Value input = op->getOperand(0);
        // The last operand is the output memref by design
        Value output = op->getOperand(num_operand - 1);
        update(input, output, same_num_elements_impl_);
      }
    }
  } while (!converged);
}

}  // namespace lmhlo
}  // namespace mlir
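As a quick illustration of how `ShapeConstraintAnalysis` above is meant to be queried (again, illustrative only and not part of the diff): elementwise ops make their operands and results both shape-equal and element-count-equal, while reshape-like ops only preserve the element count.

// Illustrative sketch (not in this PR): query the analysis over one block.
// Assumes `block` holds the lmhlo ops of a single bufferized function body.
bool mayShareParallelLoop(mlir::Block& block, mlir::Value a, mlir::Value b) {
  llvm::SmallVector<mlir::Operation*, 8> ops;
  for (mlir::Operation& op : block) ops.push_back(&op);
  mlir::lmhlo::ShapeConstraintAnalysis analysis(ops);
  // Same shape implies the same number of elements; the reverse does not hold.
  return analysis.HasSameNumElements(a, b);
}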
|  | @ -0,0 +1,570 @@ | ||||||
|  | /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 | ||||||
|  | 
 | ||||||
|  | Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | you may not use this file except in compliance with the License. | ||||||
|  | You may obtain a copy of the License at | ||||||
|  | 
 | ||||||
|  |     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  | 
 | ||||||
|  | Unless required by applicable law or agreed to in writing, software | ||||||
|  | distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | See the License for the specific language governing permissions and | ||||||
|  | limitations under the License. | ||||||
|  | ==============================================================================*/ | ||||||
|  | 
 | ||||||
|  | #include "mlir-hlo/Dialect/mhlo/transforms/PassDetail.h" | ||||||
|  | #include "mlir-hlo/Dialect/mhlo/transforms/fusion_utils.h" | ||||||
|  | #include "mlir-hlo/utils/cycle_detector.h" | ||||||
|  | #include "mlir/Dialect/Shape/IR/Shape.h"      // TF:llvm-project
 | ||||||
|  | #include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 | ||||||
|  | #include "mlir/IR/MLIRContext.h"              // TF:llvm-project
 | ||||||
|  | #include "mlir/IR/Matchers.h" | ||||||
|  | #include "mlir/Pass/Pass.h"               // TF:local_config_mlir
 | ||||||
|  | #include "mlir/Transforms/RegionUtils.h"  // TF:llvm-project
 | ||||||
|  | 
 | ||||||
|  | // This pass has similar functionality of the fusion pass in XLA stack.
 | ||||||
|  | // However, unlike XLA, it targets the fully dynamic shape scenario.
 | ||||||
|  | // Currently, it implements the kLoop and kInput fusion templates.
 | ||||||
|  | // During conversion, it tries to greedily find kLoop/kInput fusion
 | ||||||
|  | // patterns.
 | ||||||
|  | //
 | ||||||
|  | // Similar to XLA, this pass supports fusion pattern having multiple outputs
 | ||||||
|  | // if all the shape of outputs are consistent. Following are some examples.
 | ||||||
|  | //
 | ||||||
|  | //        kLoop                          kInput
 | ||||||
|  | // +----+  +----+  +----+    +----+    +----+    +----+
 | ||||||
|  | // |elem|  |elem|  |elem|    |elem<----+elem+---->elem+----+
 | ||||||
|  | // +-+--+  +-+--+  +-+--+    +-+--+    +----+    +-+--+    |
 | ||||||
|  | //   |       |       |         |                   |       |
 | ||||||
|  | //   |               |         |                   |       |
 | ||||||
|  | // +-v--+    |     +-v--+   +--v---+            +--v---+   |
 | ||||||
|  | // |elem+<---+----<+elem|   |reduce|            |reduce|   |
 | ||||||
|  | // +-+--+          +-+--+   +--+---+            +--+---+   |
 | ||||||
|  | //   |               |         |                   |       |
 | ||||||
|  | //   |               |         |                   |       |
 | ||||||
|  | //   v               v         v                   v       v
 | ||||||
|  | //
 | ||||||
|  | // To this end, we also add an simple shape constraint analysis phase.
 | ||||||
|  | // For kLoop fusion template, it requires all the outputs of the fused
 | ||||||
|  | // pattern have the same shape. However, we don't know the actual value
 | ||||||
|  | // of the shape at the compile time in the dynamic shape world.
 | ||||||
|  | // Fortunately, we could still infer the relationship among different ops
 | ||||||
|  | // according to their shape constraint traits. Currently, We only consider
 | ||||||
|  | // shape equality propagation for elementwise ops (assuming that implicit
 | ||||||
|  | // shape broadcast is forbidden). The above process could be built on the
 | ||||||
|  | // shape dialect once it is ready.
 | ||||||
|  | //
 | ||||||
|  | // TODO(disc): This file implements fusion on buffer level, re-visit this after
 | ||||||
|  | // more shape inference/constraint infras are ready in mhlo level.
 | ||||||
|  | // TODO(disc): Not using fusibility interface a.t.m, re-visit this if necessary.
 | ||||||
|  | 
 | ||||||
|  | namespace mlir { | ||||||
|  | namespace lmhlo { | ||||||
|  | namespace { | ||||||
|  | 
 | ||||||
|  | struct FusionOptions { | ||||||
|  |   // Maximum allowed number of arguments per fused kernel. Here arguments
 | ||||||
|  |   // include both ready-only buffers and writable buffers.
 | ||||||
|  |   int max_num_arguments_per_kernel; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // A fusion planner that can propose a fusion plan for a block of ops.
 | ||||||
|  | // The fusion plan is consisted of a group of fusion patterns.
 | ||||||
|  | //
 | ||||||
|  | // Currently all proposed patterns followed xla kLoop/kInput like fusion
 | ||||||
|  | // templates while are adapted to the fully dynamic shape world.
 | ||||||
|  | //
 | ||||||
|  | // kLoop fusion template satisfies:
 | ||||||
|  | //   - all ops in the fusion pattern are element-wise.
 | ||||||
|  | //   - all the shapes of outputs of fusion pattern are same or have same number
 | ||||||
|  | //   of elements, and thus can fit into a same parallel loop.
 | ||||||
|  | //
 | ||||||
|  | // kInput fusion template satisfies:
 | ||||||
|  | //   - any op in the fusion pattern is either element-wise or a reduction.
 | ||||||
|  | //   - if a op is a reduction, its output cannot be consumed by other
 | ||||||
|  | //     ops in the same fusion pattern.
 | ||||||
|  | //   - all the effective shapes of outputs of fusion pattern are same.
 | ||||||
|  | //     - For element-wise op, its effective shape is its output shape.
 | ||||||
|  | //     - For reduction op, its effective shape is its operand shape.
 | ||||||
|  | //   - currently our downstreaming codegen engine only support 2d -> 1d tensor
 | ||||||
|  | //   reduction. TODO(disc): lift this limitation.
 | ||||||
|  | //     - 2D row reduction: out[i] = sum({in[i][j] for all j})
 | ||||||
|  | //     - 2D column reduction: out[j] = sum({in[i][j] for all i}
 | ||||||
|  | class FusionPlanner { | ||||||
|  |  public: | ||||||
|  |   explicit FusionPlanner(const FusionOptions& options, Block* block) | ||||||
|  |       : options_(options), block_(block) { | ||||||
|  |     // Move up metadata-only ops (e.g. dim, shape_of) as far as possible.
 | ||||||
|  |     MoveUpMetadataOnlyOpsForFusion(); | ||||||
|  | 
 | ||||||
|  |     for (Operation& op : *block) { | ||||||
|  |       op_list_.push_back(&op); | ||||||
|  |     } | ||||||
|  |     shape_analysis_.reset(new ShapeConstraintAnalysis(op_list_)); | ||||||
|  |     cycle_detector_.reset(new GraphCycles(op_list_.size())); | ||||||
|  |     BuildNodeMap(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Returns a fusion plan if success, otherwise none.
 | ||||||
|  |   llvm::Optional<FusionPlan> Run() { | ||||||
|  |     // Greedily search connected fusible pattern, and ops belonging to
 | ||||||
|  |     // a same fusion pattern are grouped into a cluster.
 | ||||||
|  |     RunEdgeContractionLoop(); | ||||||
|  | 
 | ||||||
|  |     // After doing edge contraction, each unique cluster having size
 | ||||||
|  |     // more than one represents a potential fusion pattern.
 | ||||||
|  |     // We collect all these clusters and construct a fusion plan.
 | ||||||
|  |     FusionPlan plan; | ||||||
|  |     DenseSet<Cluster*> seen_clusters; | ||||||
|  |     for (Operation* op : op_list_) { | ||||||
|  |       Cluster* cluster = GetClusterForNode(op); | ||||||
|  |       if (!seen_clusters.insert(cluster).second) continue; | ||||||
|  |       FusionPattern& fusion_pattern = cluster->fused_pattern(); | ||||||
|  |       // Make sure the ops in a fusion pattern are in topological ordering.
 | ||||||
|  |       fusion_pattern.sortFusionOpListBy(op_to_node_id_); | ||||||
|  |       if (!fusion_pattern.isFusible() || fusion_pattern.effectiveSize() <= 1) { | ||||||
|  |         continue; | ||||||
|  |       } | ||||||
|  |       plan.emplace_back(fusion_pattern); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Re-order ops inside the blocks to make sure all producers are placed
 | ||||||
|  |     // before its consumers after fusion.
 | ||||||
|  |     ReorderOperationsInsideBlock(); | ||||||
|  |     return plan; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Returns the op_list this planner operates on.
 | ||||||
|  |   const SmallVectorImpl<Operation*>& op_list() const { return op_list_; } | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   // Represent a (partial) fused pattern
 | ||||||
|  |   class Cluster { | ||||||
|  |    public: | ||||||
|  |     Cluster(int node_id, FusionPlanner* planner) | ||||||
|  |         : node_id_(node_id), pattern_(planner->op_list()[node_id]) {} | ||||||
|  | 
 | ||||||
|  |     // Merges `other` into this cluster, and clears `other`.
 | ||||||
|  |     void Merge(Cluster* other) { | ||||||
|  |       pattern_.mergeInplace(other->fused_pattern()); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // The number of nodes in this cluster.
 | ||||||
|  |     int cluster_size() { return pattern_.size(); } | ||||||
|  | 
 | ||||||
|  |     // The ID of the cluster as represented in `cycle_detector_`.
 | ||||||
|  |     int cycles_graph_node_id() const { return node_id_; } | ||||||
|  | 
 | ||||||
|  |     // Sets the ID of the cluster as represented in `cycle_detector_`.
 | ||||||
|  |     void set_cycles_graph_node_id(int cycles_graph_node_id) { | ||||||
|  |       node_id_ = cycles_graph_node_id; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Currently the fused pattern this cluster holds.
 | ||||||
|  |     FusionPattern& fused_pattern() { return pattern_; } | ||||||
|  | 
 | ||||||
|  |    private: | ||||||
|  |     // ID of the representative node of this cluster.
 | ||||||
|  |     int node_id_; | ||||||
|  | 
 | ||||||
|  |     // the fused pattern this cluster holds.
 | ||||||
|  |     FusionPattern pattern_; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   // Returns a new cluster with specified `cycles_graph_node_id`
 | ||||||
|  |   Cluster* MakeCluster(int cycles_graph_node_id) { | ||||||
|  |     cluster_storage_.emplace_back(new Cluster(cycles_graph_node_id, this)); | ||||||
|  |     return cluster_storage_.back().get(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Metadata ops (e.g. shapeOf, dimOp) don't change data thus we move forward
 | ||||||
|  |   // them as far as possible inside the same block to enable more fusion
 | ||||||
|  |   // opportunities.
 | ||||||
|  |   void MoveUpMetadataOnlyOpsForFusion() { | ||||||
|  |     SmallVector<Operation*, 4> ops; | ||||||
|  |     for (Operation& op : *block_) { | ||||||
|  |       ops.push_back(&op); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     auto inBlock = [&](Operation* op, Block* block) { | ||||||
|  |       return op && op->getBlock() == block; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     for (Operation* op : ops) { | ||||||
|  |       Block* block = op->getBlock(); | ||||||
|  |       if (isa<shape::ShapeOfOp>(op)) { | ||||||
|  |         Operation* definingOp = op->getOperand(0).getDefiningOp(); | ||||||
|  |         if (!inBlock(definingOp, block)) { | ||||||
|  |           op->moveBefore(block, block->begin()); | ||||||
|  |         } else { | ||||||
|  |           op->moveAfter(definingOp); | ||||||
|  |         } | ||||||
|  |       } else if (isa<memref::DimOp>(op)) { | ||||||
|  |         Operation* firstOperandOp = op->getOperand(0).getDefiningOp(); | ||||||
|  |         Operation* secondOperandOp = op->getOperand(1).getDefiningOp(); | ||||||
|  |         if (!inBlock(firstOperandOp, block) && | ||||||
|  |             !inBlock(secondOperandOp, block)) { | ||||||
|  |           op->moveBefore(block, block->begin()); | ||||||
|  |         } else if (!inBlock(firstOperandOp, block)) { | ||||||
|  |           op->moveAfter(secondOperandOp); | ||||||
|  |         } else if (!inBlock(secondOperandOp, block)) { | ||||||
|  |           op->moveAfter(firstOperandOp); | ||||||
|  |         } else if (firstOperandOp->isBeforeInBlock(secondOperandOp)) { | ||||||
|  |           op->moveAfter(secondOperandOp); | ||||||
|  |         } else { | ||||||
|  |           op->moveAfter(firstOperandOp); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Returns all the values touched by this op or its nested ops.
 | ||||||
|  |   SmallVector<Value, 4> GetAllPossibleUsedValues(Operation* op) { | ||||||
|  |     SmallVector<Value, 4> values; | ||||||
|  |     op->walk([&](Operation* nest_op) { | ||||||
|  |       for (Value v : nest_op->getOperands()) { | ||||||
|  |         values.push_back(v); | ||||||
|  |       } | ||||||
|  |     }); | ||||||
|  |     return values; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Builds the initial dependency graph.
 | ||||||
|  |   void BuildNodeMap() { | ||||||
|  |     int num_nodes = op_list_.size(); | ||||||
|  |     for (int node_id = 0; node_id < num_nodes; ++node_id) { | ||||||
|  |       Operation* op = op_list_[node_id]; | ||||||
|  |       MakeCluster(node_id); | ||||||
|  |       op_to_node_id_[op] = node_id; | ||||||
|  |       leader_for_node_.insert(node_id); | ||||||
|  |       for (Value operand : GetAllPossibleUsedValues(op)) { | ||||||
|  |         Operation* operand_op = FindLastWriter(operand); | ||||||
|  |         // Only consider the operand_op inside the target block.
 | ||||||
|  |         auto iter = op_to_node_id_.find(operand_op); | ||||||
|  |         if (iter == op_to_node_id_.end()) { | ||||||
|  |           continue; | ||||||
|  |         } | ||||||
|  |         // Add an edge to connect the last writer and the current consumer.
 | ||||||
|  |         cycle_detector_->InsertEdge(iter->second, node_id); | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       // For some ops (e.g. lmhlo ops), some operands are the output memrefs
 | ||||||
|  |       // Thus these operands are supposed to be updated.
 | ||||||
|  |       // Suppose that a op (or its nested ops) can only write the buffers
 | ||||||
|  |       // explicit passed in as operands of this op.
 | ||||||
|  |       int num_input_operand = op->getNumOperands() - getNumResultOperands(op); | ||||||
|  |       for (Value v : op->getOperands().drop_front(num_input_operand)) { | ||||||
|  |         auto it = last_writer_.try_emplace(v, op); | ||||||
|  |         (void)it; | ||||||
|  |         // Currently, a buffer is only supposed to be written once (as the
 | ||||||
|  |         // output operand of one lmhlo op).
 | ||||||
|  |         assert(it.second); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Returns the cluster contains this op.
 | ||||||
|  |   Cluster* GetClusterForNode(Operation* n) { | ||||||
|  |     int id = op_to_node_id_[n]; | ||||||
|  |     id = leader_for_node_.getLeaderValue(id); | ||||||
|  |     return cluster_storage_[id].get(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Returns the cluster contains the op having `node_id`.
 | ||||||
|  |   Cluster* GetClusterForCyclesGraphNode(int node_id) { | ||||||
|  |     return cluster_storage_[leader_for_node_.getLeaderValue(node_id)].get(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Merges the clusters `cluster_from` and `cluster_to`.
 | ||||||
|  |   bool MergeClusters(Cluster* cluster_from, Cluster* cluster_to) { | ||||||
|  |     int from = cluster_from->cycles_graph_node_id(); | ||||||
|  |     int to = cluster_to->cycles_graph_node_id(); | ||||||
|  | 
 | ||||||
|  |     auto optional_merged_node = cycle_detector_->ContractEdge(from, to); | ||||||
|  |     if (!optional_merged_node.hasValue()) { | ||||||
|  |       llvm::dbgs() << "Could not contract " << from << " -> " << to | ||||||
|  |                    << " because contracting the edge would create a cycle."; | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Merge the clusters.
 | ||||||
|  |     cluster_from->Merge(cluster_to); | ||||||
|  |     cluster_from->set_cycles_graph_node_id(*optional_merged_node); | ||||||
|  | 
 | ||||||
|  |     // Merge the UnionFind Set.
 | ||||||
|  |     leader_for_node_.unionSets(from, to); | ||||||
|  |     return true; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   using FnTy = llvm::function_ref<bool(Cluster*, Cluster*)>; | ||||||
|  |   bool ForEachEdgeInPostOrder(FnTy fn, bool enable_cross_fusion = false) { | ||||||
|  |     bool changed = false; | ||||||
|  |     for (int32_t node : cycle_detector_->AllNodesInPostOrder()) { | ||||||
|  |       Cluster* cluster_from = GetClusterForCyclesGraphNode(node); | ||||||
|  |       // Make a copy of the set of successors because we may modify the graph in
 | ||||||
|  |       // TryToContractEdge.
 | ||||||
|  |       std::vector<int32_t> successors_copy = | ||||||
|  |           cycle_detector_->SuccessorsCopy(cluster_from->cycles_graph_node_id()); | ||||||
|  | 
 | ||||||
|  |       for (int to : successors_copy) { | ||||||
|  |         Cluster* cluster_to = GetClusterForCyclesGraphNode(to); | ||||||
|  |         bool contracted_edge = fn(cluster_from, cluster_to); | ||||||
|  |         changed |= contracted_edge; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!enable_cross_fusion) return changed; | ||||||
|  | 
 | ||||||
|  |     // To enable even more fusion opportunities (e.g. horizontal fusion)
 | ||||||
|  |     for (int32_t lhs : cycle_detector_->AllNodesInPostOrder()) { | ||||||
|  |       Cluster* cluster_lhs = GetClusterForCyclesGraphNode(lhs); | ||||||
|  |       if (!cluster_lhs) { | ||||||
|  |         continue; | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       for (int32_t rhs : cycle_detector_->AllNodesInPostOrder()) { | ||||||
|  |         Cluster* cluster_rhs = GetClusterForCyclesGraphNode(rhs); | ||||||
|  |         if (!cluster_rhs || cluster_lhs == cluster_rhs) { | ||||||
|  |           continue; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         bool contracted_edge = fn(cluster_lhs, cluster_rhs); | ||||||
|  |         changed |= contracted_edge; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return changed; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // This function checks if fusing `from` with `to` is valid and, if so, | ||||||
|  |   // performs the merge. The validity is based on the operations in the | ||||||
|  |   // clusters and the compatibility of the shapes of the outputs of the | ||||||
|  |   // would-be fused clusters. | ||||||
|  |   // Returns true if the merge was performed. | ||||||
|  |   bool TryToContractEdge(Cluster* from, Cluster* to) { | ||||||
|  |     // Try the merge and check whether it is valid. | ||||||
|  |     if (!from->fused_pattern().isMergeable(to->fused_pattern())) return false; | ||||||
|  |     FusionPattern fused_pattern = | ||||||
|  |         from->fused_pattern().merge(to->fused_pattern()); | ||||||
|  |     auto& op_list = fused_pattern.getOpList(); | ||||||
|  |     auto& operands = fused_pattern.getOperands(); | ||||||
|  |     auto& results = fused_pattern.getResults(); | ||||||
|  | 
 | ||||||
|  |     if (results.size() + operands.size() > | ||||||
|  |         options_.max_num_arguments_per_kernel) { | ||||||
|  |       // Some backend devices (e.g. GPU) do not support a kernel with too | ||||||
|  |       // many arguments. | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // We currently do not support a constant op as the final output of a | ||||||
|  |     // fusion pattern. | ||||||
|  |     // TODO(disc): copy small constants when necessary. | ||||||
|  |     for (Value result : results) { | ||||||
|  |       Operation* result_op = FindLastWriter(result); | ||||||
|  |       assert(result_op); | ||||||
|  |       if (isa<lmhlo::ConstOp>(result_op)) { | ||||||
|  |         return false; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // A ReduceOp cannot have a consumer within the fusion pattern. | ||||||
|  |     for (Operation* op : op_list) { | ||||||
|  |       if (!isa<lmhlo::ReduceOp>(op)) continue; | ||||||
|  |       int num_input_operand = op->getNumOperands() - getNumResultOperands(op); | ||||||
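|  |       // lmhlo ops list their input buffers before their output buffers, so | ||||||
|  |       // dropping the inputs leaves only the reduce's result buffers. | ||||||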
|  |       for (Value v : op->getOperands().drop_front(num_input_operand)) { | ||||||
|  |         for (Operation* user : getValueUsers(v)) { | ||||||
|  |           if (user == op) continue; | ||||||
|  |           if (std::find(op_list.begin(), op_list.end(), user) != | ||||||
|  |               op_list.end()) { | ||||||
|  |             return false; | ||||||
|  |           } | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // All outputs of a fusion pattern should have compatible shapes. | ||||||
|  |     // Here `compatible` means: | ||||||
|  |     // - if `to` and `from` are both kInput fusions, all outputs should have | ||||||
|  |     //   the same shape. | ||||||
|  |     // - otherwise, all outputs should have the same number of elements. | ||||||
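|  |     // For example, a memref<?x?xf32> and the memref<?x?x?xf32> produced by a | ||||||
|  |     // dynamic_reshape of it have the same number of elements, so both may be | ||||||
|  |     // kept as outputs of a kLoop fusion. | ||||||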
|  | 
 | ||||||
|  |     // If there are no outside users, these ops may be eliminated. We still | ||||||
|  |     // fuse them here and let a later pass do the DCE. | ||||||
|  |     if (results.empty()) return true; | ||||||
|  | 
 | ||||||
|  |     bool check_same_shape = (to->fused_pattern().isKInputFusion() && | ||||||
|  |                              from->fused_pattern().isKInputFusion()); | ||||||
|  |     auto get_effective_shape = [&](Value v) { | ||||||
|  |       auto result_op = FindLastWriter(v); | ||||||
|  |       assert(result_op); | ||||||
|  |       // The effective shape of a reduce op is its operand's shape. | ||||||
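|  |       // (The iteration space of a kInput fusion follows the reduce input, so | ||||||
|  |       // we compare against that input rather than the reduced result.) | ||||||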
|  |       return isa<lmhlo::ReduceOp>(result_op) ? result_op->getOperand(0) : v; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     Value ref_shape = get_effective_shape(results[0]); | ||||||
|  |     if (!llvm::all_of(results, [&](Value result) { | ||||||
|  |           Value shape = get_effective_shape(result); | ||||||
|  |           return check_same_shape | ||||||
|  |                      ? shape_analysis_->HasSameShape(ref_shape, shape) | ||||||
|  |                      : shape_analysis_->HasSameNumElements(ref_shape, shape); | ||||||
|  |         })) { | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return MergeClusters(from, to); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Greedily fuses connected nodes. | ||||||
|  |   bool RunEdgeContractionLoop() { | ||||||
|  |     using std::placeholders::_1; | ||||||
|  |     using std::placeholders::_2; | ||||||
|  |     bool changed = false; | ||||||
|  | 
 | ||||||
|  |     // Run edge contraction repeatedly until nothing more can be fused. | ||||||
|  |     while (ForEachEdgeInPostOrder( | ||||||
|  |         std::bind(&FusionPlanner::TryToContractEdge, this, _1, _2), false)) { | ||||||
|  |       changed = true; | ||||||
|  |     } | ||||||
|  |     return changed; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Here `value` is supposed to be a pointer to a buffer. | ||||||
|  |   // Returns the defining op of `value` if no known op updates the buffer, | ||||||
|  |   // otherwise returns the last op that updates the buffer pointed to by | ||||||
|  |   // `value`. | ||||||
|  |   Operation* FindLastWriter(Value value) { | ||||||
|  |     auto it = last_writer_.find(value); | ||||||
|  |     if (it != last_writer_.end()) { | ||||||
|  |       return it->second; | ||||||
|  |     } | ||||||
|  |     return value.getDefiningOp(); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Re-order ops inside the block to make sure that producers are before | ||||||
|  |   // consumers after fusion. | ||||||
|  |   void ReorderOperationsInsideBlock() { | ||||||
|  |     auto reorder_func = [&](Cluster* from, Cluster* to) { | ||||||
|  |       FusionPattern& from_pattern = from->fused_pattern(); | ||||||
|  |       FusionPattern& to_pattern = to->fused_pattern(); | ||||||
|  | 
 | ||||||
|  |       Operation* last_op_in_from = from_pattern.getOpList().back(); | ||||||
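|  |       // Move any op of `to` that currently appears before the end of `from` | ||||||
|  |       // to just after `from`'s last op, so producers stay before consumers. | ||||||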
|  |       for (Operation* op : llvm::reverse(to_pattern.getOpList())) { | ||||||
|  |         if (!last_op_in_from->isBeforeInBlock(op)) | ||||||
|  |           op->moveAfter(last_op_in_from); | ||||||
|  |       } | ||||||
|  |       return false; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     ForEachEdgeInPostOrder(reorder_func); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Hyper-parameters that control the behaviour of the fusion planner. | ||||||
|  |   FusionOptions options_; | ||||||
|  | 
 | ||||||
|  |   // The block that the fusion planner works on. | ||||||
|  |   Block* block_; | ||||||
|  | 
 | ||||||
|  |   // Ops inside the block. | ||||||
|  |   SmallVector<Operation*, 4> op_list_; | ||||||
|  | 
 | ||||||
|  |   // Shape equality checker. | ||||||
|  |   std::unique_ptr<ShapeConstraintAnalysis> shape_analysis_; | ||||||
|  | 
 | ||||||
|  |   // op -> node_id | ||||||
|  |   DenseMap<Operation*, int> op_to_node_id_; | ||||||
|  | 
 | ||||||
|  |   // Used to make sure we do not introduce a cycle after fusion. | ||||||
|  |   std::unique_ptr<GraphCycles> cycle_detector_; | ||||||
|  |   std::vector<std::unique_ptr<Cluster>> cluster_storage_; | ||||||
|  | 
 | ||||||
|  |   // A UnionFind set. Each set represents a (partial) fusion pattern and has | ||||||
|  |   // a leader as its representative. | ||||||
|  |   EquivalenceClasses<int32_t> leader_for_node_; | ||||||
|  | 
 | ||||||
|  |   // Maps a buffer value to the last op (if any) that is known to update the | ||||||
|  |   // buffer pointed to by that value. | ||||||
|  |   DenseMap<Value, Operation*> last_writer_; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct LhloFusionPass : public LhloFusionPassBase<LhloFusionPass> { | ||||||
|  |   using LhloFusionPassBase<LhloFusionPass>::LhloFusionPassBase; | ||||||
|  |   explicit LhloFusionPass(int max_num_arguments_per_kernel) | ||||||
|  |       : LhloFusionPassBase<LhloFusionPass>::LhloFusionPassBase() { | ||||||
|  |     this->max_num_arguments_per_kernel_ = max_num_arguments_per_kernel; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   void runOnFunction() override { | ||||||
|  |     FuncOp func = getFunction(); | ||||||
|  | 
 | ||||||
|  |     // Collect all blocks inside the function. | ||||||
|  |     SmallVector<Block*, 4> blocks; | ||||||
|  |     CollectBlocksInsideFunction(func, blocks); | ||||||
|  | 
 | ||||||
|  |     // Process each block and do fusion within the block. | ||||||
|  |     FusionOptions options; | ||||||
|  |     options.max_num_arguments_per_kernel = max_num_arguments_per_kernel_; | ||||||
|  |     for (Block* block : blocks) { | ||||||
|  |       FusionPlanner planner(options, block); | ||||||
|  |       llvm::Optional<FusionPlan> plan = planner.Run(); | ||||||
|  |       if (!plan) { | ||||||
|  |         emitError(func.getLoc(), | ||||||
|  |                   "an error occurs while trying to find fusion candidates"); | ||||||
|  |         signalPassFailure(); | ||||||
|  |         return; | ||||||
|  |       } | ||||||
|  |       if (!ApplyFusionPlan(*plan)) { | ||||||
|  |         emitError(func.getLoc(), "apply fusion plan failed"); | ||||||
|  |         signalPassFailure(); | ||||||
|  |         return; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   bool ApplyFusionPlan(FusionPlan& plan) { | ||||||
|  |     for (FusionPattern& pattern : plan) { | ||||||
|  |       auto& op_list = pattern.getOpList(); | ||||||
|  |       OpBuilder b(op_list.back()); | ||||||
|  | 
 | ||||||
|  |       // Get the fused locations. | ||||||
|  |       SmallVector<Location, 4> locations; | ||||||
|  |       locations.reserve(op_list.size()); | ||||||
|  |       for (Operation* op : op_list) { | ||||||
|  |         locations.push_back(op->getLoc()); | ||||||
|  |       } | ||||||
|  |       Location fused_loc = | ||||||
|  |           FusedLoc::get(op_list.back()->getContext(), locations); | ||||||
|  | 
 | ||||||
|  |       // Move ops inside the fusion pattern to the region attached to the fusion op. | ||||||
|  |       FusionOp fusion = b.create<lmhlo::FusionOp>(fused_loc); | ||||||
|  |       Region& region = fusion.region(); | ||||||
|  |       Block& block = region.front(); | ||||||
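|  |       // Iterating in reverse while always inserting at the block front | ||||||
|  |       // preserves the original relative order of the ops. | ||||||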
|  |       for (Operation* op : llvm::reverse(op_list)) { | ||||||
|  |         op->moveBefore(&block, block.begin()); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     return true; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   void CollectBlocksInsideFunction(FuncOp op, SmallVectorImpl<Block*>& blocks) { | ||||||
|  |     op.walk([&](Block* block) { | ||||||
|  |       // It does not make sense to fuse the region attached to these ops. | ||||||
|  |       if (!isa<lmhlo::ReduceOp, lmhlo::FusionOp>(block->getParentOp())) | ||||||
|  |         blocks.push_back(block); | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | }  // namespace | ||||||
|  | 
 | ||||||
|  | std::unique_ptr<OperationPass<FuncOp>> createLhloFusionPass( | ||||||
|  |     int max_num_arguments_per_kernel) { | ||||||
|  |   return std::make_unique<LhloFusionPass>(max_num_arguments_per_kernel); | ||||||
|  | } | ||||||
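|  | // Illustrative usage (not part of this patch): the pass runs on functions, | ||||||
|  | // so a client pipeline would typically schedule it along the lines of | ||||||
|  | //   pm.addNestedPass<FuncOp>(createLhloFusionPass(/*max_num_arguments_per_kernel=*/64)); | ||||||
|  | // where `pm` is an mlir::PassManager and 64 is only an example limit. | ||||||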
|  | 
 | ||||||
|  | }  // namespace lmhlo | ||||||
|  | }  // namespace mlir | ||||||
|  | @ -0,0 +1,286 @@ | ||||||
|  | // RUN: mlir-hlo-opt --lhlo-fusion -split-input-file %s -o - | FileCheck %s | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @simple_kloop_fusion | ||||||
|  | // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?x?xf32>, %[[ARG3:.*]]: memref<?x?xf32>) -> memref<?x?xf32> | ||||||
|  | func @simple_kloop_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, | ||||||
|  |                           %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>) -> memref<?x?xf32> { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG1]], %[[ARG2]], %[[ARG3]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG3]] : memref<?x?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.add"(%arg1, %arg2, %arg3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   return %arg3 : memref<?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @simple_multi_output_kloop_fusion | ||||||
|  | // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?x?xf32>, %[[ARG3:.*]]: memref<?x?xf32>) -> (memref<?x?xf32>, memref<?x?xf32>) | ||||||
|  | func @simple_multi_output_kloop_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, | ||||||
|  |                           %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>) -> (memref<?x?xf32>, memref<?x?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG1]], %[[ARG2]], %[[ARG3]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG1]], %[[ARG3]] : memref<?x?xf32>, memref<?x?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.add"(%arg1, %arg2, %arg3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   return %arg1, %arg3 : memref<?x?xf32>, memref<?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @simple_multi_output_kloop_fusion_with_reorder | ||||||
|  | // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?x?xf32>, %[[ARG3:.*]]: memref<?x?xf32>, %[[ARG4:.*]]: memref<2xindex>, %[[ARG5:.*]]: memref<?x?xf32>) | ||||||
|  | func @simple_multi_output_kloop_fusion_with_reorder(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, | ||||||
|  |                           %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>, | ||||||
|  |                           %arg4: memref<2xindex>, %arg5:  memref<?x?xf32>) -> (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG1]], %[[ARG2]], %[[ARG3]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: "lmhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[ARG4]], %[[ARG5]]) | ||||||
|  |   // CHECK: return %[[ARG1]], %[[ARG3]], %[[ARG5]] : memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.dynamic_broadcast_in_dim"(%arg1, %arg4, %arg5) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (memref<?x?xf32>, memref<2xindex>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.add"(%arg1, %arg2, %arg3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   return %arg1, %arg3, %arg5 : memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @same_num_elements_multi_output_kloop_fusion | ||||||
|  | // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<2xi64>, %[[ARG3:.*]]: memref<?x?x?xf32>, %[[ARG4:.*]]: memref<?x?x?xf32>, %[[ARG5:.*]]: memref<?x?x?xf32>) | ||||||
|  | func @same_num_elements_multi_output_kloop_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, | ||||||
|  |                           %arg2: memref<2xi64>, %arg3: memref<?x?x?xf32>, | ||||||
|  |                           %arg4: memref<?x?x?xf32>, %arg5:  memref<?x?x?xf32>) -> (memref<?x?xf32>, memref<?x?x?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.dynamic_reshape"(%[[ARG1]], %[[ARG2]], %[[ARG3]]) | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG3]], %[[ARG4]], %[[ARG5]]) : (memref<?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?xf32>) -> () | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG1]], %[[ARG5]] : memref<?x?xf32>, memref<?x?x?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.dynamic_reshape"(%arg1, %arg2, %arg3) : (memref<?x?xf32>, memref<2xi64>, memref<?x?x?xf32>) -> () | ||||||
|  |   "lmhlo.add"(%arg3, %arg4, %arg5) : (memref<?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?xf32>) -> () | ||||||
|  |   return %arg1, %arg5 : memref<?x?xf32>, memref<?x?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @check_not_kloop_fusion | ||||||
|  | func @check_not_kloop_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>, %arg3: memref<?x?xf32>) -> (memref<?x?xf32>, memref<?x?xf32>) { | ||||||
|  |   // CHECK-NOT: "lmhlo.fusion" | ||||||
|  |   "lmhlo.add"(%arg0, %arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.subtract"(%arg2, %arg2, %arg3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   return %arg1, %arg3: memref<?x?xf32>, memref<?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @kloop_fusion_with_dealloc | ||||||
|  | // CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>) | ||||||
|  | func @kloop_fusion_with_dealloc(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>) -> (memref<?x?xf32>, memref<?x?xf32>) { | ||||||
|  |   // CHECK: %[[TMP3:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP5:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP9:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP13:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP16:.*]] = memref.alloc | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[TMP3]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.multiply"(%[[ARG0]], %[[ARG1]], %[[TMP5]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[TMP3]], %[[TMP9]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[TMP5]], %[[TMP13]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.multiply"(%[[TMP9]], %[[TMP13]], %[[TMP16]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: memref.dealloc %[[TMP3]] : memref<?x?xf32> | ||||||
|  |   // CHECK: memref.dealloc %[[TMP5]] : memref<?x?xf32> | ||||||
|  |   // CHECK: memref.dealloc %[[TMP13]] : memref<?x?xf32> | ||||||
|  |   // CHECK: return %[[TMP9]], %[[TMP16]] : memref<?x?xf32>, memref<?x?xf32> | ||||||
|  |   %c0 = constant 0 : index | ||||||
|  |   %c1 = constant 1 : index | ||||||
|  |   %0 = shape.shape_of %arg0 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %1 = tensor.extract %0[%c0] : tensor<2xindex> | ||||||
|  |   %2 = tensor.extract %0[%c1] : tensor<2xindex> | ||||||
|  |   %3 = memref.alloc(%1, %2) : memref<?x?xf32> | ||||||
|  |   "lmhlo.add"(%arg0, %arg1, %3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   %4 = memref.alloc(%1, %2) : memref<?x?xf32> | ||||||
|  |   "lmhlo.multiply"(%arg0, %arg1, %4) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   %5 = shape.shape_of %3 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %6 = tensor.extract %5[%c0] : tensor<2xindex> | ||||||
|  |   %7 = tensor.extract %5[%c1] : tensor<2xindex> | ||||||
|  |   %8 = memref.alloc(%6, %7) : memref<?x?xf32> | ||||||
|  |   "lmhlo.abs"(%3, %8) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   memref.dealloc %3 : memref<?x?xf32> | ||||||
|  |   %9 = shape.shape_of %4 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %10 = tensor.extract %9[%c0] : tensor<2xindex> | ||||||
|  |   %11 = tensor.extract %9[%c1] : tensor<2xindex> | ||||||
|  |   %12 = memref.alloc(%10, %11) : memref<?x?xf32> | ||||||
|  |   "lmhlo.abs"(%4, %12) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   memref.dealloc %4 : memref<?x?xf32> | ||||||
|  |   %13 = shape.shape_of %8 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %14 = tensor.extract %13[%c0] : tensor<2xindex> | ||||||
|  |   %15 = tensor.extract %13[%c1] : tensor<2xindex> | ||||||
|  |   %16 = memref.alloc(%14, %15) : memref<?x?xf32> | ||||||
|  |   "lmhlo.multiply"(%8, %12, %16) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   memref.dealloc %12 : memref<?x?xf32> | ||||||
|  |   return %8, %16 : memref<?x?xf32>, memref<?x?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @simple_kinput | ||||||
|  | // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?xf32>, %[[ARG3:.*]]: memref<f32> | ||||||
|  | func @simple_kinput(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?xf32>, %init: memref<f32>) -> memref<?xf32> { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG1]], %[[ARG3]], %[[ARG2]]) ( { | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG2]] : memref<?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg1, %init, %arg2) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[0]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   return %arg2: memref<?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @multi_output_kinput | ||||||
|  | // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?xf32>, %[[ARG3:.*]]: memref<f32> | ||||||
|  | func @multi_output_kinput(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?xf32>, %init: memref<f32>) -> (memref<?x?xf32>, memref<?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG0]], %[[ARG1]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG1]], %[[ARG3]], %[[ARG2]]) ( { | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG1]], %[[ARG2]] : memref<?x?xf32>, memref<?xf32> | ||||||
|  |   "lmhlo.abs"(%arg0, %arg1) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg1, %init, %arg2) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[0]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   return %arg1, %arg2: memref<?x?xf32>, memref<?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @row_red_and_row_red_kinput | ||||||
|  | // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?x?xf32>, %[[ARG3:.*]]: memref<?xf32>, %[[ARG4:.*]]: memref<?xf32>, %[[ARG5:.*]]: memref<?x?xf32>, %[[ARG6:.*]]: memref<f32> | ||||||
|  | func @row_red_and_row_red_kinput(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>, %arg3: memref<?xf32>, %arg4: memref<?xf32>, %arg5: memref<?x?xf32>, %init: memref<f32>) -> (memref<?xf32>, memref<?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG2]], %[[ARG5]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG5]], %[[ARG6]], %[[ARG3]]) ( { | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG2]], %[[ARG6]], %[[ARG4]]) ( { | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG3]], %[[ARG4]] : memref<?xf32>, memref<?xf32> | ||||||
|  |   "lmhlo.add"(%arg0, %arg1, %arg2) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.abs"(%arg2, %arg5) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg5, %init, %arg3) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[1]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg2, %init, %arg4) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[1]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   return %arg3, %arg4: memref<?xf32>, memref<?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @row_red_and_col_red_kinput | ||||||
|  | // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32>, %[[ARG2:.*]]: memref<?x?xf32>, %[[ARG3:.*]]: memref<?xf32>, %[[ARG4:.*]]: memref<?xf32>, %[[ARG5:.*]]: memref<?x?xf32>, %[[ARG6:.*]]: memref<f32> | ||||||
|  | func @row_red_and_col_red_kinput(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>, %arg3: memref<?xf32>, %arg4: memref<?xf32>, %arg5: memref<?x?xf32>, %init: memref<f32>) -> (memref<?xf32>, memref<?xf32>) { | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.abs"(%[[ARG2]], %[[ARG5]]) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG5]], %[[ARG6]], %[[ARG3]]) ( { | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[ARG2]], %[[ARG6]], %[[ARG4]]) ( { | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: return %[[ARG3]], %[[ARG4]] : memref<?xf32>, memref<?xf32> | ||||||
|  |   "lmhlo.add"(%arg0, %arg1, %arg2) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.abs"(%arg2, %arg5) : (memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg5, %init, %arg3) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[1]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   "lmhlo.reduce"(%arg2, %init, %arg4) ( { | ||||||
|  |   ^bb0(%targ1: memref<f32>, %targ2: memref<f32>, %tresult: memref<f32>): | ||||||
|  |     "lmhlo.add"(%targ1, %targ2, %tresult) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   } ) {dimensions = dense<[0]> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   return %arg3, %arg4: memref<?xf32>, memref<?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @reduce_should_not_have_consumer_in_the_fusion | ||||||
|  | // CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: memref<?x?xf32> | ||||||
|  | func @reduce_should_not_have_consumer_in_the_fusion(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>) | ||||||
|  | -> (memref<?x?xf32>, memref<?xf32>) { | ||||||
|  |   // CHECK: %[[TMP4:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP7:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP8:.*]] = memref.alloc | ||||||
|  |   // CHECK: %[[TMP9:.*]] = memref.alloc | ||||||
|  |   // CHECK: "lmhlo.fusion"() ( { | ||||||
|  |   // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[TMP4]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.subtract"(%[[ARG0]], %[[TMP4]], %[[TMP7]]) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   // CHECK: "lmhlo.constant"(%[[TMP8]]) {value = dense<0.000000e+00> : tensor<f32>} : (memref<f32>) -> () | ||||||
|  |   // CHECK: "lmhlo.reduce"(%[[TMP7]], %[[TMP8]], %[[TMP9]]) ( { | ||||||
|  |   // CHECK: }) : () -> () | ||||||
|  |   // CHECK: memref.dealloc %[[TMP4]] : memref<?x?xf32> | ||||||
|  |   // CHECK: memref.dealloc %[[TMP8]] : memref<f32> | ||||||
|  |   // CHECK: %[[TMP12:.*]] = memref.alloc | ||||||
|  |   // CHECK: "lmhlo.add"(%[[TMP9]], %[[TMP9]], %[[TMP12]]) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> () | ||||||
|  |   // CHECK: memref.dealloc %[[TMP9]] : memref<?xf32> | ||||||
|  |   // CHECK: return %[[TMP7]], %[[TMP12]] : memref<?x?xf32>, memref<?xf32> | ||||||
|  |   %c1 = constant 1 : index | ||||||
|  |   %c0 = constant 0 : index | ||||||
|  |   %0 = shape.shape_of %arg0 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %1 = tensor.extract %0[%c0] : tensor<2xindex> | ||||||
|  |   %2 = tensor.extract %0[%c1] : tensor<2xindex> | ||||||
|  |   %3 = memref.alloc(%1, %2) : memref<?x?xf32> | ||||||
|  |   "lmhlo.add"(%arg0, %arg1, %3) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   %4 = shape.shape_of %arg0 : memref<?x?xf32> -> tensor<2xindex> | ||||||
|  |   %5 = tensor.extract %4[%c0] : tensor<2xindex> | ||||||
|  |   %6 = tensor.extract %4[%c1] : tensor<2xindex> | ||||||
|  |   %7 = memref.alloc(%5, %6) : memref<?x?xf32> | ||||||
|  |   "lmhlo.subtract"(%arg0, %3, %7) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> () | ||||||
|  |   memref.dealloc %3 : memref<?x?xf32> | ||||||
|  |   %8 = memref.alloc() : memref<f32> | ||||||
|  |   "lmhlo.constant"(%8) {value = dense<0.000000e+00> : tensor<f32>} : (memref<f32>) -> () | ||||||
|  |   %9 = memref.alloc(%5) : memref<?xf32> | ||||||
|  |   "lmhlo.reduce"(%7, %8, %9) ( { | ||||||
|  |   ^bb0(%arg2: memref<f32>, %arg3: memref<f32>, %arg4: memref<f32>):  // no predecessors | ||||||
|  |     "lmhlo.add"(%arg2, %arg3, %arg4) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |     "lmhlo.terminator"() : () -> () | ||||||
|  |   }) {dimensions = dense<1> : tensor<1xi64>} : (memref<?x?xf32>, memref<f32>, memref<?xf32>) -> () | ||||||
|  |   memref.dealloc %8 : memref<f32> | ||||||
|  |   %10 = shape.shape_of %9 : memref<?xf32> -> tensor<1xindex> | ||||||
|  |   %11 = tensor.extract %10[%c0] : tensor<1xindex> | ||||||
|  |   %12 = memref.alloc(%11) : memref<?xf32> | ||||||
|  |   "lmhlo.add"(%9, %9, %12) : (memref<?xf32>, memref<?xf32>, memref<?xf32>) -> () | ||||||
|  |   memref.dealloc %9 : memref<?xf32> | ||||||
|  |   return %7, %12 : memref<?x?xf32>, memref<?xf32> | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // ----- | ||||||
|  | 
 | ||||||
|  | // CHECK-LABEL: @const_should_not_be_output | ||||||
|  | func @const_should_not_be_output(%arg0: memref<f32>) -> (memref<f32>, memref<f32>) { | ||||||
|  |   // CHECK-NOT: lmhlo.fusion | ||||||
|  |   %0 = memref.alloc() : memref<f32> | ||||||
|  |   "lmhlo.constant"(%0) {value = dense<1.000000e+00> : tensor<f32>} : (memref<f32>) -> () | ||||||
|  |   %1 = memref.alloc() : memref<f32> | ||||||
|  |   "lmhlo.add"(%arg0, %0, %1) : (memref<f32>, memref<f32>, memref<f32>) -> () | ||||||
|  |   return %0, %1 : memref<f32>, memref<f32> | ||||||
|  | } | ||||||