From 1ba4c714c93e3f4f78960e7099994ded9c5d371d Mon Sep 17 00:00:00 2001
From: Hanhan Wang
Date: Fri, 21 May 2021 12:16:14 -0700
Subject: [PATCH] Add support for lowering mhlo.scatter ops to Linalg.

This only works for update-style scatters, not add/min/max computations.
It requires the index depth to be 1 because of a limitation in Linalg: we
cannot compare multiple indices without packing them together.

PiperOrigin-RevId: 375137721
---
 .../mhlo/transforms/legalize_to_linalg.cc     | 97 +++++++++++++++++++
 tests/hlo-legalize-to-linalg.mlir             | 80 +++++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
index 24afdb3..5c8184d 100644
--- a/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
+++ b/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
@@ -2091,6 +2091,102 @@ struct TorchIndexSelectOpOnTensorsConversion
   }
 };
 
+struct ScatterUpdateOnTensorsConversion
+    : public OpConversionPattern<mhlo::ScatterOp> {
+  using OpConversionPattern<mhlo::ScatterOp>::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(
+      mhlo::ScatterOp op, ArrayRef<Value> args,
+      ConversionPatternRewriter& rewriter) const final {
+    mhlo::ScatterOp::Adaptor adaptor(args);
+
+    // Check if it is a tensor_scatter_nd_update-like op.
+    auto& body_ops = op.getRegion().front().getOperations();
+    if (body_ops.size() != 1) return failure();
+    auto ret_arg = body_ops.front().getOperand(0).dyn_cast<BlockArgument>();
+    if (!ret_arg || ret_arg.getArgNumber() != 1) return failure();
+
+    auto operand_ty = adaptor.operand().getType().dyn_cast<RankedTensorType>();
+    auto indices_ty =
+        adaptor.scatter_indices().getType().dyn_cast<RankedTensorType>();
+    if (!operand_ty || !indices_ty) return failure();
+
+    // Linalg operations put all the computation in the innermost loop. Since
+    // we also iterate over scatter_indices() with some loops, we can only
+    // check one scatter index per iteration. If there are multiple indices
+    // (i.e., the index depth is greater than 1), we have no way to keep the
+    // comparison state across iterations. E.g., if the index depth is 2,
+    // like indices = [[0, 1]], we should use the update value only if
+    // (i == 0 and j == 1). However, we cannot get both indices in one
+    // iteration unless we pack them together.
+    auto index_vector_dim =
+        op.scatter_dimension_numbers().index_vector_dim().getInt();
+    if (indices_ty.getDimSize(index_vector_dim) != 1)
+      return rewriter.notifyMatchFailure(op, "require index depth to be 1");
+    if (index_vector_dim != indices_ty.getRank() - 1) {
+      return rewriter.notifyMatchFailure(
+          op, "require index_vector_dim to be the last dim");
+    }
+
+    // One of the indices dims is the index depth vector.
+    int64_t nloops = operand_ty.getRank() + indices_ty.getRank() - 1;
+    SmallVector<AffineMap> indexing_maps;
+    {
+      SmallVector<AffineExpr> exprs;
+      for (int64_t i = 0, e = operand_ty.getRank(); i < e; ++i)
+        exprs.push_back(rewriter.getAffineDimExpr(i));
+      indexing_maps.push_back(AffineMap::get(nloops, /*symbolCount=*/0, exprs,
+                                             rewriter.getContext()));
+    }
+    {
+      SmallVector<AffineExpr> exprs;
+      for (int64_t i = operand_ty.getRank(); i < nloops; ++i)
+        exprs.push_back(rewriter.getAffineDimExpr(i));
+      // The index depth is 1.
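+      // E.g., for a rank-1 operand and rank-2 indices (nloops == 2), the
+      // exprs built above are just (d1); appending the constant 0 below
+      // yields the scatter_indices map (d0, d1) -> (d1, 0), which is MAP1
+      // in the tests.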
+      exprs.push_back(rewriter.getAffineConstantExpr(0));
+      indexing_maps.push_back(AffineMap::get(nloops, /*symbolCount=*/0, exprs,
+                                             rewriter.getContext()));
+
+      exprs.pop_back();
+      auto update_window_dims =
+          Extract1DVector(op.scatter_dimension_numbers().update_window_dims());
+      for (auto d : update_window_dims)
+        exprs.push_back(rewriter.getAffineDimExpr(d));
+      indexing_maps.push_back(AffineMap::get(nloops, /*symbolCount=*/0, exprs,
+                                             rewriter.getContext()));
+    }
+    indexing_maps.push_back(indexing_maps.front());
+
+    auto result_ty = this->typeConverter->convertType(op.getResult().getType())
+                         .cast<ShapedType>();
+    auto scatter_dims_to_operand_dims = Extract1DVector(
+        op.scatter_dimension_numbers().scatter_dims_to_operand_dims());
+    assert(scatter_dims_to_operand_dims.size() == 1);
+    // No init_tensor is needed because the output is initialized from the
+    // operand.
+    auto linalg_op = rewriter.create<linalg::GenericOp>(
+        op.getLoc(), /*resultTensors=*/ArrayRef<Type>{result_ty},
+        /*inputs=*/
+        ValueRange{adaptor.operand(), adaptor.scatter_indices(),
+                   adaptor.updates()},
+        /*outputs=*/adaptor.operand(), indexing_maps,
+        GetNParallelLoopsAttrs(nloops),
+        [&](OpBuilder& b, Location loc, ValueRange args) {
+          Value cmp_idx =
+              b.create<linalg::IndexOp>(loc, scatter_dims_to_operand_dims[0]);
+          Value idx = b.create<IndexCastOp>(loc, b.getIndexType(), args[1]);
+          Value pred = b.create<CmpIOp>(loc, b.getI1Type(), CmpIPredicate::eq,
+                                        cmp_idx, idx);
+          // Use the output arg, so update values that were already written
+          // are not reset to the initial value.
+          Value res = b.create<SelectOp>(loc, args[2].getType(), pred, args[2],
+                                         args[3]);
+          b.create<linalg::YieldOp>(loc, res);
+        });
+    rewriter.replaceOp(op, linalg_op.getResults());
+    return success();
+  }
+};
+
 void populateLHLOToLinalgConversionPattern(MLIRContext* context,
                                            TypeConverter& typeConverter,
                                            OwningRewritePatternList* patterns) {
@@ -2353,6 +2449,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context,
                       DepthwiseConvOpOnTensorsConversion,
                       ReduceOnTensorsConversion,
                       ReduceWindowOpOnTensorsConversion,
+                      ScatterUpdateOnTensorsConversion,
                       TorchIndexSelectOpOnTensorsConversion,
                       PadOpOnTensorsConversion>(type_converter, context);
   // clang-format on
diff --git a/tests/hlo-legalize-to-linalg.mlir b/tests/hlo-legalize-to-linalg.mlir
index 955187e..f46cc97 100644
--- a/tests/hlo-legalize-to-linalg.mlir
+++ b/tests/hlo-legalize-to-linalg.mlir
@@ -2379,3 +2379,83 @@ func @unsigned_compare(%lhs: tensor<2x2xui32>, %rhs: tensor<2x2xui32>) -> tensor<2x2xi1> {
   %0 = "mhlo.compare"(%lhs, %rhs) {comparison_direction = "GT"} : (tensor<2x2xui32>, tensor<2x2xui32>) -> tensor<2x2xi1>
   return %0 : tensor<2x2xi1>
 }
+
+// -----
+
+func @scatter_update_scalar(%arg0: tensor<3xi32>, %arg1: tensor<1x1xi32>,
+                            %arg2: tensor<1xi32>) -> tensor<3xi32> {
+  %0 = "mhlo.scatter"(%arg0, %arg1, %arg2) ( {
+  ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):  // no predecessors
+    "mhlo.return"(%arg4) : (tensor<i32>) -> ()
+  }) {
+    indices_are_sorted = false,
+    scatter_dimension_numbers = {
+      index_vector_dim = 1 : i64,
+      inserted_window_dims = dense<0> : tensor<1xi64>,
+      scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>,
+      update_window_dims = dense<> : tensor<0xi64>
+    },
+    unique_indices = false
+  } : (tensor<3xi32>, tensor<1x1xi32>, tensor<1xi32>) -> tensor<3xi32>
+  return %0 : tensor<3xi32>
+}
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d1, 0)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1)>
+// CHECK:     func @scatter_update_scalar
+// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9_]*]]
+// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9_]*]]
+// CHECK:       %[[RES:.*]] = linalg.generic
+// CHECK-SAME:    indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP0]]]
+// CHECK-SAME:    iterator_types = ["parallel", "parallel"]
+// CHECK-SAME:    ins(%[[ARG0]], %[[ARG1]], %[[ARG2]] : tensor<3xi32>, tensor<1x1xi32>, tensor<1xi32>)
+// CHECK-SAME:    outs(%[[ARG0]] : tensor<3xi32>) {
+// CHECK:       ^bb0(%{{.*}}: i32, %[[IDX_I32:.*]]: i32, %[[UPDATE:.*]]: i32, %[[OUT:.*]]: i32):  // no predecessors
+// CHECK:         %[[CMP_IDX:.*]] = linalg.index 0 : index
+// CHECK:         %[[IDX:.*]] = index_cast %[[IDX_I32]] : i32 to index
+// CHECK:         %[[PRED:.*]] = cmpi eq, %[[CMP_IDX]], %[[IDX]] : index
+// CHECK:         %[[SELECT:.*]] = select %[[PRED]], %[[UPDATE]], %[[OUT]] : i32
+// CHECK:         linalg.yield %[[SELECT]] : i32
+// CHECK:       } -> tensor<3xi32>
+// CHECK:       return %[[RES]] : tensor<3xi32>
+
+// -----
+
+func @scatter_update_slice(%arg0: tensor<6x3xi32>, %arg1: tensor<2x1xi32>,
+                           %arg2: tensor<2x3xi32>) -> tensor<6x3xi32> {
+  %0 = "mhlo.scatter"(%arg0, %arg1, %arg2) ( {
+  ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):  // no predecessors
+    "mhlo.return"(%arg4) : (tensor<i32>) -> ()
+  }) {
+    indices_are_sorted = false,
+    scatter_dimension_numbers = {
+      index_vector_dim = 1 : i64,
+      inserted_window_dims = dense<0> : tensor<1xi64>,
+      scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>,
+      update_window_dims = dense<1> : tensor<1xi64>
+    },
+    unique_indices = false
+  } : (tensor<6x3xi32>, tensor<2x1xi32>, tensor<2x3xi32>) -> tensor<6x3xi32>
+  return %0 : tensor<6x3xi32>
+}
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d2, 0)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK:     func @scatter_update_slice
+// CHECK-SAME:  %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK-SAME:  %[[ARG1:[a-zA-Z0-9_]*]]
+// CHECK-SAME:  %[[ARG2:[a-zA-Z0-9_]*]]
+// CHECK:       %[[RES:.*]] = linalg.generic
+// CHECK-SAME:    indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP0]]]
+// CHECK-SAME:    iterator_types = ["parallel", "parallel", "parallel"]
+// CHECK-SAME:    ins(%[[ARG0]], %[[ARG1]], %[[ARG2]] : tensor<6x3xi32>, tensor<2x1xi32>, tensor<2x3xi32>)
+// CHECK-SAME:    outs(%[[ARG0]] : tensor<6x3xi32>) {
+// CHECK:       ^bb0(%{{.*}}: i32, %[[IDX_I32:.*]]: i32, %[[UPDATE:.*]]: i32, %[[OUT:.*]]: i32):  // no predecessors
+// CHECK:         %[[CMP_IDX:.*]] = linalg.index 0 : index
+// CHECK:         %[[IDX:.*]] = index_cast %[[IDX_I32]] : i32 to index
+// CHECK:         %[[PRED:.*]] = cmpi eq, %[[CMP_IDX]], %[[IDX]] : index
+// CHECK:         %[[SELECT:.*]] = select %[[PRED]], %[[UPDATE]], %[[OUT]] : i32
+// CHECK:         linalg.yield %[[SELECT]] : i32
+// CHECK:       } -> tensor<6x3xi32>
+// CHECK:       return %[[RES]] : tensor<6x3xi32>
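
For reference, the pattern rejects any scatter whose index depth is not 1:
matchAndRewrite calls notifyMatchFailure and the op is left for other
lowerings. A hypothetical example (not part of this patch's tests) that
would not be matched, because indices_ty.getDimSize(index_vector_dim) == 2:

  func @scatter_update_depth2(%arg0: tensor<6x3xi32>, %arg1: tensor<1x2xi32>,
                              %arg2: tensor<1xi32>) -> tensor<6x3xi32> {
    %0 = "mhlo.scatter"(%arg0, %arg1, %arg2) ( {
    ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):
      "mhlo.return"(%arg4) : (tensor<i32>) -> ()
    }) {
      indices_are_sorted = false,
      scatter_dimension_numbers = {
        // Index depth 2: each row of %arg1 holds a full (row, col) index.
        index_vector_dim = 1 : i64,
        inserted_window_dims = dense<[0, 1]> : tensor<2xi64>,
        scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64>,
        update_window_dims = dense<> : tensor<0xi64>
      },
      unique_indices = false
    } : (tensor<6x3xi32>, tensor<1x2xi32>, tensor<1xi32>) -> tensor<6x3xi32>
    return %0 : tensor<6x3xi32>
  }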