Extend fusion root heuristic to also work in partially bufferized programs.

We now follow data flow through tensor_cast, tensor_load and tensor_to_memref operations. PiperOrigin-RevId: 342851104
2020-11-17 06:34:11 -08:00 · 2020-11-17 06:34:11 -08:00 · 0c7152e65c
parent d9113e0b4d
commit 0c7152e65c
2 changed files with 80 additions and 0 deletions
--- a/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc
+++ b/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc
@ -91,6 +91,31 @@ class LhloFuseLinalgPass
        if (result_buffers.insert(alias).second) {
          worklist.push_back(alias);
        }
        continue;
      }
      if (auto tensor_load = dyn_cast<TensorLoadOp>(definingOp)) {
        auto alias = tensor_load.memref();
        if (result_buffers.insert(alias).second) {
          worklist.push_back(alias);
        }
        continue;
      }
      if (auto tensor_to_memref = dyn_cast<TensorToMemrefOp>(definingOp)) {
        auto alias = tensor_to_memref.tensor();
        if (result_buffers.insert(alias).second) {
          worklist.push_back(alias);
        }
        continue;
      }
      if (auto tensor_cast = dyn_cast<TensorCastOp>(definingOp)) {
        auto alias = tensor_cast.source();
        if (result_buffers.insert(alias).second) {
          worklist.push_back(alias);
        }
        continue;
      }
      if (auto regionInterface =
--- a/tests/lhlo-fuse-linalg.mlir
+++ b/tests/lhlo-fuse-linalg.mlir
@ -372,3 +372,58 @@ func @branching_result(%arg0: memref<?xf32>, %arg1: memref<?xindex>, %arg2: inde
 //       PLOOP:  else
 //       PLOOP:    memref_reshape
 //       PLOOP:    scf.yield
 // -----
 // Confirm that tiling information is passed through tensor_load, tensor_cast
 // and tensor_to_memref operations.
 func @tensor_ops(%arg0: memref<32xf32>, %arg1: memref<32xindex>)
    -> memref<?xf32> {
  %c1 = constant 1 : index  // not referenced by any op in this function
  %1 = alloc() : memref<32xf32>  // buffer written by the linalg.generic below
  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
                                   affine_map<(d0) -> (d0)>],
                  iterator_types = ["parallel"]}
      ins(%arg0 : memref<32xf32>) outs(%1 : memref<32xf32>) {  // reads %arg0, writes %1
  ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
    %13 = absf %arg3 : f32  // element-wise absolute value of the input
    linalg.yield %13 : f32
  }
  %2 = tensor_load %1 : memref<32xf32>  // view the buffer %1 as a tensor
  %3 = tensor_cast %2 : tensor<32xf32> to tensor<?xf32>  // static -> dynamic shape
  %4 = tensor_to_memref %3 : memref<?xf32>  // back to a memref, which is returned
  return %4 : memref<?xf32>
 }
 // CHECK-LABEL: func @tensor_ops
 //       CHECK:  %[[C1:.*]] = constant 1
 //   CHECK-NOT:  linalg.generic
 //       CHECK:  scf.for {{.*}} step %[[C1]]
 //   CHECK-NOT:  scf.for
 //       CHECK:      linalg.generic
 //       CHECK:        absf
 //       CHECK:  tensor_load
 //       CHECK:  tensor_cast
 //       CHECK:  tensor_to_memref
 // TILED-LABEL: func @tensor_ops
 //   TILED-DAG:  %[[C2:.*]] = constant 2
 //   TILED-NOT:  linalg.generic
 //       TILED:  scf.for {{.*}} step %[[C2]]
 //   TILED-NOT:  scf.for
 //       TILED:      linalg.generic
 //       TILED:        absf
 //       TILED:  tensor_load
 //       TILED:  tensor_cast
 //       TILED:  tensor_to_memref
 // PLOOP-LABEL: func @tensor_ops
 //   PLOOP-NOT:  linalg.generic
 //       PLOOP:  scf.parallel
 //   PLOOP-NOT:  scf.parallel
 //       PLOOP:      linalg.generic
 //       PLOOP:        absf
 //       PLOOP:  tensor_load
 //       PLOOP:  tensor_cast
 //       PLOOP:  tensor_to_memref