mlir-hlo/tests/rank-specialization.mlir

// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster | FileCheck %s
// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster --mhlo-rank-specialization-to-scf | FileCheck %s --check-prefix CHECK-SCF

// Binary CHLO operations: a multiply feeding an add, all operands unranked.
// Both ops are expected to end up inside one cluster whose operands are listed
// in the order (%arg2, %arg0, %arg1).
// CHECK-LABEL: @add_mul
// CHECK-SAME:  (%[[A:.*]]: tensor<*xf32>, %[[B:.*]]: tensor<*xf32>, %[[C:.*]]: tensor<*xf32>)
func @add_mul(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>,
    %arg2 : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[CLUSTER:.*]] = "chlo.rank_specialization_cluster"(%[[C]], %[[A]], %[[B]]) ( {
  // CHECK: ^bb0(%[[C_:.*]]: tensor<*xf32>, %[[A_:.*]]: tensor<*xf32>, %[[B_:.*]]: tensor<*xf32>):
  // CHECK:   %[[PRODUCT:.*]] = chlo.broadcast_multiply %[[A_]], %[[B_]]
  // CHECK:   %[[SUM:.*]] = chlo.broadcast_add %[[PRODUCT]], %[[C_]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[SUM]])
  // CHECK: }) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  // CHECK: return %[[CLUSTER]]
  %product = chlo.broadcast_multiply %arg0, %arg1
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %sum = chlo.broadcast_add %product, %arg2
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  return %sum : tensor<*xf32>
}

// -----

// Chain of three unary MHLO operations on an unranked tensor; the whole chain
// is expected to be moved into a single cluster.
// CHECK-LABEL: @sqrt
// CHECK-SAME: (%[[INPUT:.*]]: tensor<*xf32>)
func @sqrt(%arg : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[CLUSTER:.*]] = "chlo.rank_specialization_cluster"(%[[INPUT]])
  // CHECK: ^bb0(%[[INPUT_:.*]]: tensor<*xf32>):
  // CHECK:   %[[ROOT0:.*]] = "mhlo.sqrt"(%[[INPUT_]])
  // CHECK:   %[[ROOT1:.*]] = "mhlo.sqrt"(%[[ROOT0]])
  // CHECK:   %[[ROOT2:.*]] = "mhlo.sqrt"(%[[ROOT1]])
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[ROOT2]])
  // CHECK: return %[[CLUSTER]]
  %root0 = "mhlo.sqrt"(%arg) : (tensor<*xf32>) -> tensor<*xf32>
  %root1 = "mhlo.sqrt"(%root0) : (tensor<*xf32>) -> tensor<*xf32>
  %root2 = "mhlo.sqrt"(%root1) : (tensor<*xf32>) -> tensor<*xf32>
  return %root2 : tensor<*xf32>
}

// Expectations for the second run line (cluster + lowering to SCF): the operand
// is reshaped to a rank-1 tensor holding all elements, the sqrt chain runs on
// that flat tensor, and the result is reshaped back to the original shape.
// CHECK-SCF-LABEL: @sqrt
// CHECK-SCF-SAME:  (%[[INPUT:.*]]: tensor<*xf32>)
// CHECK-SCF:       %[[DIMS:.*]] = shape.shape_of %[[INPUT]]
// CHECK-SCF:       %[[COUNT:.*]] = shape.num_elements %[[DIMS]]
// CHECK-SCF:       %[[FLAT_DIMS:.*]] = tensor.from_elements %[[COUNT]]
// CHECK-SCF:       %[[FLAT_INPUT:.*]] = "mhlo.dynamic_reshape"(%[[INPUT]], %[[FLAT_DIMS]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
// CHECK-SCF:       %[[ROOT0:.*]] = "mhlo.sqrt"(%[[FLAT_INPUT]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[ROOT1:.*]] = "mhlo.sqrt"(%[[ROOT0]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[ROOT2:.*]] = "mhlo.sqrt"(%[[ROOT1]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[UNFLAT:.*]] = "mhlo.dynamic_reshape"(%[[ROOT2]], %[[DIMS]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
// CHECK-SCF:       return %[[UNFLAT]]

// -----

// Ranked operations must be left alone: no cluster may appear in the output
// for this chain of sqrt ops on tensor<3x?xf32>.
// CHECK-LABEL: @sqrt_ranked
// CHECK-SAME: (%[[INPUT:.*]]: tensor<3x?xf32>)
func @sqrt_ranked(%arg: tensor<3x?xf32>) -> tensor<3x?xf32> {
  // CHECK-NOT: rank_specialization_cluster
  %root0 = "mhlo.sqrt"(%arg) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  %root1 = "mhlo.sqrt"(%root0) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  %root2 = "mhlo.sqrt"(%root1) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  return %root2 : tensor<3x?xf32>
}

// -----

// Ternary operation mixing two unranked operands with a ranked one
// (tensor<2xf32>); the select is still expected to be clustered.
// CHECK-LABEL: @select_mixed
// CHECK-SAME: (%[[COND:.*]]: tensor<*xi1>, %[[UNRANKED:.*]]: tensor<*xf32>, %[[RANKED:.*]]: tensor<2xf32>)
func @select_mixed(%pred: tensor<*xi1>, %arg1: tensor<*xf32>,
    %arg2: tensor<2xf32>)  -> tensor<*xf32> {
  // CHECK: %[[CLUSTER:.*]] = "chlo.rank_specialization_cluster"(%[[COND]], %[[UNRANKED]], %[[RANKED]])
  // CHECK: ^bb0(%[[COND_:.*]]: tensor<*xi1>, %[[UNRANKED_:.*]]: tensor<*xf32>, %[[RANKED_:.*]]: tensor<2xf32>)
  // CHECK:   %[[SELECTED:.*]] = chlo.broadcast_select %[[COND_]], %[[UNRANKED_]], %[[RANKED_]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[SELECTED]])
  // CHECK: return %[[CLUSTER]]
  %selected = "chlo.broadcast_select"(%pred, %arg1, %arg2)
      : (tensor<*xi1>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32>
  return %selected : tensor<*xf32>
}

// -----

// Chain of three unary CHLO operations on an unranked tensor; all of them are
// expected to land in one cluster.
// CHECK-LABEL: @tan
// CHECK-SAME: (%[[INPUT:.*]]: tensor<*xf32>) -> tensor<*xf32>
func @tan(%arg : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[CLUSTER:.*]] = "chlo.rank_specialization_cluster"(%[[INPUT]]) ( {
  // CHECK: ^bb0(%[[INPUT_:.*]]: tensor<*xf32>)
  // CHECK:   %[[TAN0:.*]] = chlo.tan %[[INPUT_]]
  // CHECK:   %[[TAN1:.*]] = chlo.tan %[[TAN0]]
  // CHECK:   %[[TAN2:.*]] = chlo.tan %[[TAN1]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TAN2]])
  // CHECK: return %[[CLUSTER]]
  %tan0 = chlo.tan %arg : tensor<*xf32> -> tensor<*xf32>
  %tan1 = chlo.tan %tan0 : tensor<*xf32> -> tensor<*xf32>
  %tan2 = chlo.tan %tan1 : tensor<*xf32> -> tensor<*xf32>
  return %tan2 : tensor<*xf32>
}

// -----

// Unary CHLO, unary MHLO and binary CHLO operations combined in one dataflow
// graph; all six ops are expected in a single cluster whose operands are
// listed in the order (%arg2, %arg1, %arg0).
// CHECK-LABEL: @mixed
// CHECK-SAME:  (%[[TAN_IN:.*]]: tensor<*xf32>, %[[SQRT_IN:.*]]: tensor<*xf32>, %[[ADDEND:.*]]: tensor<*xf32>)
func @mixed(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>)
    -> tensor<*xf32> {
  // CHECK: %[[CLUSTER:.*]] = "chlo.rank_specialization_cluster"(%[[ADDEND]], %[[SQRT_IN]], %[[TAN_IN]])
  // CHECK: ^bb0(%[[ADDEND_:.*]]: tensor<*xf32>, %[[SQRT_IN_:.*]]: tensor<*xf32>, %[[TAN_IN_:.*]]: tensor<*xf32>)
  // CHECK:   %[[TAN0:.*]] = chlo.tan %[[TAN_IN_]]
  // CHECK:   %[[ROOT0:.*]] = "mhlo.sqrt"(%[[SQRT_IN_]])
  // CHECK:   %[[PRODUCT:.*]] = chlo.broadcast_multiply %[[TAN0]], %[[ROOT0]]
  // CHECK:   %[[SUM:.*]] = chlo.broadcast_add %[[PRODUCT]], %[[ADDEND_]]
  // CHECK:   %[[ROOT1:.*]] = "mhlo.sqrt"(%[[SUM]])
  // CHECK:   %[[TAN1:.*]] = chlo.tan %[[ROOT1]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TAN1]])
  // CHECK: return %[[CLUSTER]]
  %tan0 = chlo.tan %arg0 : tensor<*xf32> -> tensor<*xf32>
  %root0 = "mhlo.sqrt"(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
  %product = chlo.broadcast_multiply %tan0, %root0
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %sum = chlo.broadcast_add %product, %arg2
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %root1 = "mhlo.sqrt"(%sum) : (tensor<*xf32>) -> tensor<*xf32>
  %tan1 = chlo.tan %root1 : tensor<*xf32> -> tensor<*xf32>
  return %tan1 : tensor<*xf32>
}
[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00			`// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster \| FileCheck %s`
[MLIR][HLO] Add `rank-specialization-to-scf` pass Currently the lowering is only implemented for the unary case. The n-ary case will follow. PiperOrigin-RevId: 374162772 2021-05-17 18:55:32 +08:00			`// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster --mhlo-rank-specialization-to-scf \| FileCheck %s --check-prefix CHECK-SCF`
[MLIR][HLO] Add `rank-specialization-cluster` pass Add a pass to cluster unranked C/HLO operations in one `chlo.rank_specialization_cluster` op. The C/HLO operations are moved to the body of the operation. Later passes can use this to rank-specialize all these operations together. PiperOrigin-RevId: 373336725 2021-05-12 18:45:09 +08:00
			`// CHECK-LABEL: @add_mul`
			`// CHECK-SAME: (%[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<*xf32>)`
			`func @add_mul(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>,`
			`%arg2 : tensor<*xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG0]], %[[ARG1]]) ( {`
			`// CHECK: ^bb0(%[[ARG2_:.*]]: tensor<*xf32>, %[[ARG0_:.*]]: tensor<*xf32>, %[[ARG1_:.*]]: tensor<*xf32>):`
			`// CHECK: %[[TMP:.*]] = chlo.broadcast_multiply %[[ARG0_]], %[[ARG1_]]`
			`// CHECK: %[[INNER_RES:.*]] = chlo.broadcast_add %[[TMP]], %[[ARG2_]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[INNER_RES]])`
			`// CHECK: }) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.broadcast_multiply %arg0, %arg1`
			`: (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>`
			`%1 = chlo.broadcast_add %0, %arg2`
			`: (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>`
			`return %1 : tensor<*xf32>`
			`}`
[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00
			`// -----`

			`// Unary MHLO operation.`
			`// CHECK-LABEL: @sqrt`
			`// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>)`
			`func @sqrt(%arg : tensor<*xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]])`
			`// CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xf32>):`
			`// CHECK: %[[TMP0:.*]] = "mhlo.sqrt"(%[[ARG_]])`
			`// CHECK: %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]])`
			`// CHECK: %[[TMP2:.*]] = "mhlo.sqrt"(%[[TMP1]])`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP2]])`
			`// CHECK: return %[[RES]]`
			`%0 = "mhlo.sqrt"(%arg) : (tensor<*xf32>) -> tensor<*xf32>`
			`%1 = "mhlo.sqrt"(%0) : (tensor<*xf32>) -> tensor<*xf32>`
			`%2 = "mhlo.sqrt"(%1) : (tensor<*xf32>) -> tensor<*xf32>`
			`return %2 : tensor<*xf32>`
			`}`

[MLIR][HLO] Add `rank-specialization-to-scf` pass Currently the lowering is only implemented for the unary case. The n-ary case will follow. PiperOrigin-RevId: 374162772 2021-05-17 18:55:32 +08:00			`// CHECK-SCF-LABEL: @sqrt`
			`// CHECK-SCF-SAME: (%[[ARG:.*]]: tensor<*xf32>)`
			`// CHECK-SCF: %[[SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[N:.*]] = shape.num_elements %[[SHAPE]]`
			`// CHECK-SCF: %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]`
			`// CHECK-SCF: %[[FLAT_ARG:.*]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>`
			`// CHECK-SCF: %[[TMP0:.*]] = "mhlo.sqrt"(%[[FLAT_ARG]]) : (tensor<?xf32>)`
			`// CHECK-SCF: %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]]) : (tensor<?xf32>)`
			`// CHECK-SCF: %[[TMP2:.*]] = "mhlo.sqrt"(%[[TMP1]]) : (tensor<?xf32>)`
			`// CHECK-SCF: %[[RES:.*]] = "mhlo.dynamic_reshape"(%[[TMP2]], %[[SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>`
			`// CHECK-SCF: return %[[RES]]`

[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00			`// -----`

			`// Don't cluster single ranked operation.`
			`// CHECK-LABEL: @sqrt_ranked`
			`// CHECK-SAME: (%[[ARG:.*]]: tensor<3x?xf32>)`
			`func @sqrt_ranked(%arg: tensor<3x?xf32>) -> tensor<3x?xf32> {`
			`// CHECK-NOT: rank_specialization_cluster`
			`%0 = "mhlo.sqrt"(%arg) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`%1 = "mhlo.sqrt"(%0) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`%2 = "mhlo.sqrt"(%1) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`return %2 : tensor<3x?xf32>`
			`}`
[MLIR][HLO] Allow rank specialization clustering with `chlo.broadcast_select` op PiperOrigin-RevId: 373379990 2021-05-12 23:55:45 +08:00
			`// -----`

			`// Ternary operation.`
			`// CHECK-LABEL: @select_mixed`
			`// CHECK-SAME: (%[[PRED:.*]]: tensor<*xi1>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<2xf32>)`
			`func @select_mixed(%pred: tensor<*xi1>, %arg1: tensor<*xf32>,`
			`%arg2: tensor<2xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[PRED]], %[[ARG1]], %[[ARG2]])`
			`// CHECK: ^bb0(%[[PRED_:.*]]: tensor<*xi1>, %[[ARG1_:.*]]: tensor<*xf32>, %[[ARG2_:.*]]: tensor<2xf32>)`
			`// CHECK: %[[TMP:.*]] = chlo.broadcast_select %[[PRED_]], %[[ARG1_]], %[[ARG2_]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP]])`
			`// CHECK: return %[[RES]]`
			`%0 = "chlo.broadcast_select"(%pred, %arg1, %arg2)`
			`: (tensor<*xi1>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32>`
			`return %0 : tensor<*xf32>`
			`}`
[MLIR][HLO] Support CHLO unary operations in rank specialization clustering PiperOrigin-RevId: 373397321 2021-05-13 01:20:03 +08:00
			`// -----`

			`// Unary CHLO operation.`
			`// CHECK-LABEL: @tan`
			`// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>) -> tensor<*xf32>`
			`func @tan(%arg : tensor<*xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]]) ( {`
			`// CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xf32>)`
			`// CHECK: %[[TMP0:.*]] = chlo.tan %[[ARG_]]`
			`// CHECK: %[[TMP1:.*]] = chlo.tan %[[TMP0]]`
			`// CHECK: %[[TMP2:.*]] = chlo.tan %[[TMP1]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP2]])`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.tan %arg : tensor<*xf32> -> tensor<*xf32>`
			`%1 = chlo.tan %0 : tensor<*xf32> -> tensor<*xf32>`
			`%2 = chlo.tan %1 : tensor<*xf32> -> tensor<*xf32>`
			`return %2 : tensor<*xf32>`
			`}`
[MLIR][HLO] Add mixed test for `rank-specialization-cluster` pass PiperOrigin-RevId: 373762814 2021-05-14 19:38:10 +08:00
			`// -----`

			`// Composition of unary/binary CHLO and unary MHLO ops.`
			`// CHECK-LABEL: @mixed`
			`// CHECK-SAME: (%[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<*xf32>)`
			`func @mixed(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>)`
			`-> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG1]], %[[ARG0]])`
			`// CHECK: ^bb0(%[[ARG2_:.*]]: tensor<*xf32>, %[[ARG1_:.*]]: tensor<*xf32>, %[[ARG0_:.*]]: tensor<*xf32>)`
			`// CHECK: %[[TMP0:.*]] = chlo.tan %[[ARG0_]]`
			`// CHECK: %[[TMP1:.*]] = "mhlo.sqrt"(%[[ARG1_]])`
			`// CHECK: %[[TMP2:.*]] = chlo.broadcast_multiply %[[TMP0]], %[[TMP1]]`
			`// CHECK: %[[TMP3:.*]] = chlo.broadcast_add %[[TMP2]], %[[ARG2_]]`
			`// CHECK: %[[TMP4:.*]] = "mhlo.sqrt"(%[[TMP3]])`
			`// CHECK: %[[TMP5:.*]] = chlo.tan %[[TMP4]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP5]])`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.tan %arg0 : tensor<*xf32> -> tensor<*xf32>`
			`%1 = "mhlo.sqrt"(%arg1) : (tensor<*xf32>) -> tensor<*xf32>`
			`%2 = chlo.broadcast_multiply %0, %1`
			`: (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>`
			`%3 = chlo.broadcast_add %2, %arg2`
			`: (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>`
			`%4 = "mhlo.sqrt"(%3) : (tensor<*xf32>) -> tensor<*xf32>`
			`%5 = chlo.tan %4 : tensor<*xf32> -> tensor<*xf32>`
			`return %5 : tensor<*xf32>`
			`}`