mlir-hlo/tests/rank-specialization.mlir

// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster | FileCheck %s
// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster --mhlo-rank-specialization-to-scf | FileCheck %s --check-prefix CHECK-SCF

// CHECK-LABEL: @add_mul
// CHECK-SAME:  (%[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<*xf32>)
func @add_mul(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>,
    %arg2 : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG0]], %[[ARG1]]) ( {
  // CHECK: ^bb0(%[[ARG2_:.*]]: tensor<*xf32>, %[[ARG0_:.*]]: tensor<*xf32>, %[[ARG1_:.*]]: tensor<*xf32>):
  // CHECK:   %[[TMP:.*]] = chlo.broadcast_multiply %[[ARG0_]], %[[ARG1_]]
  // CHECK:   %[[INNER_RES:.*]] = chlo.broadcast_add %[[TMP]], %[[ARG2_]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[INNER_RES]])
  // CHECK: }) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  // CHECK: return %[[RES]]
  %0 = chlo.broadcast_multiply %arg0, %arg1
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %1 = chlo.broadcast_add %0, %arg2
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  return %1 : tensor<*xf32>
}

// -----

// Unary MHLO operation.
// CHECK-LABEL: @sqrt
// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>)
func @sqrt(%arg : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]])
  // CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xf32>):
  // CHECK:   %[[TMP0:.*]] = "mhlo.sqrt"(%[[ARG_]])
  // CHECK:   %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]])
  // CHECK:   %[[TMP2:.*]] = "mhlo.sqrt"(%[[TMP1]])
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP2]])
  // CHECK: return %[[RES]]
  %0 = "mhlo.sqrt"(%arg) : (tensor<*xf32>) -> tensor<*xf32>
  %1 = "mhlo.sqrt"(%0) : (tensor<*xf32>) -> tensor<*xf32>
  %2 = "mhlo.sqrt"(%1) : (tensor<*xf32>) -> tensor<*xf32>
  return %2 : tensor<*xf32>
}

// CHECK-SCF-LABEL: @sqrt
// CHECK-SCF-SAME:  (%[[ARG:.*]]: tensor<*xf32>)
// CHECK-SCF:       %[[SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[N:.*]] = shape.num_elements %[[SHAPE]]
// CHECK-SCF:       %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]
// CHECK-SCF:       %[[FLAT_ARG:.*]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
// CHECK-SCF:       %[[TMP0:.*]] = "mhlo.sqrt"(%[[FLAT_ARG]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[UNSHAPED_RES:.*]] = "mhlo.sqrt"(%[[TMP1]]) : (tensor<?xf32>)
// CHECK-SCF:       %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[RES:.*]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
// CHECK-SCF:       return %[[RES]]

// -----

// Don't cluster ranked operations.
// CHECK-LABEL: @sqrt_ranked
// CHECK-SAME: (%[[ARG:.*]]: tensor<3x?xf32>)
func @sqrt_ranked(%arg: tensor<3x?xf32>) -> tensor<3x?xf32> {
  // CHECK-NOT: rank_specialization_cluster
  %0 = "mhlo.sqrt"(%arg) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  %1 = "mhlo.sqrt"(%0) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  %2 = "mhlo.sqrt"(%1) : (tensor<3x?xf32>) -> tensor<3x?xf32>
  return %2 : tensor<3x?xf32>
}

// -----

// Ternary operation.
// CHECK-LABEL: @select_mixed
// CHECK-SAME: (%[[PRED:.*]]: tensor<*xi1>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<2xf32>)
func @select_mixed(%pred: tensor<*xi1>, %arg1: tensor<*xf32>,
    %arg2: tensor<2xf32>)  -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[PRED]], %[[ARG1]], %[[ARG2]])
  // CHECK: ^bb0(%[[PRED_:.*]]: tensor<*xi1>, %[[ARG1_:.*]]: tensor<*xf32>, %[[ARG2_:.*]]: tensor<2xf32>)
  // CHECK:   %[[TMP:.*]] = chlo.broadcast_select %[[PRED_]], %[[ARG1_]], %[[ARG2_]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP]])
  // CHECK: return %[[RES]]
  %0 = "chlo.broadcast_select"(%pred, %arg1, %arg2)
      : (tensor<*xi1>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32>
  return %0 : tensor<*xf32>
}

// -----

// Unary CHLO operation.
// CHECK-LABEL: @tan
// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>)
func @tan(%arg : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]]) ( {
  // CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xf32>)
  // CHECK:   %[[TMP0:.*]] = chlo.tan %[[ARG_]]
  // CHECK:   %[[TMP1:.*]] = chlo.tan %[[TMP0]]
  // CHECK:   %[[TMP2:.*]] = chlo.tan %[[TMP1]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP2]])
  // CHECK: return %[[RES]]
  %0 = chlo.tan %arg : tensor<*xf32> -> tensor<*xf32>
  %1 = chlo.tan %0 : tensor<*xf32> -> tensor<*xf32>
  %2 = chlo.tan %1 : tensor<*xf32> -> tensor<*xf32>
  return %2 : tensor<*xf32>
}

// CHECK-SCF-LABEL: @tan
// CHECK-SCF-SAME: (%[[ARG:.*]]: tensor<*xf32>)
// CHECK-SCF:      %[[SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:      %[[N:.*]] = shape.num_elements %[[SHAPE]]
// CHECK-SCF:      %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]
// CHECK-SCF:      %[[FLAT_ARG:.*]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
// CHECK-SCF:      %[[TMP0:.*]] = chlo.tan %[[FLAT_ARG]] : tensor<?xf32>
// CHECK-SCF:      %[[TMP1:.*]] = chlo.tan %[[TMP0]] : tensor<?xf32>
// CHECK-SCF:      %[[UNSHAPED_RES:.*]] = chlo.tan %[[TMP1]] : tensor<?xf32>
// CHECK-SCF:      %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:      %[[RES:.*]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
// CHECK-SCF:      return %[[RES]]

// -----

// Composition of unary/binary CHLO and unary MHLO ops.
// CHECK-LABEL: @mixed
// CHECK-SAME:  (%[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32>, %[[ARG2:.*]]: tensor<*xf32>)
func @mixed(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>, %arg2 : tensor<*xf32>)
    -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG1]], %[[ARG0]])
  // CHECK: ^bb0(%[[ARG2_:.*]]: tensor<*xf32>, %[[ARG1_:.*]]: tensor<*xf32>, %[[ARG0_:.*]]: tensor<*xf32>)
  // CHECK:   %[[TMP0:.*]] = chlo.tan %[[ARG0_]]
  // CHECK:   %[[TMP1:.*]] = "mhlo.sqrt"(%[[ARG1_]])
  // CHECK:   %[[TMP2:.*]] = chlo.broadcast_multiply %[[TMP0]], %[[TMP1]]
  // CHECK:   %[[TMP3:.*]] = chlo.broadcast_add %[[TMP2]], %[[ARG2_]]
  // CHECK:   %[[TMP4:.*]] = "mhlo.sqrt"(%[[TMP3]])
  // CHECK:   %[[TMP5:.*]] = chlo.tan %[[TMP4]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP5]])
  // CHECK: return %[[RES]]
  %0 = chlo.tan %arg0 : tensor<*xf32> -> tensor<*xf32>
  %1 = "mhlo.sqrt"(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
  %2 = chlo.broadcast_multiply %0, %1
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %3 = chlo.broadcast_add %2, %arg2
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %4 = "mhlo.sqrt"(%3) : (tensor<*xf32>) -> tensor<*xf32>
  %5 = chlo.tan %4 : tensor<*xf32> -> tensor<*xf32>
  return %5 : tensor<*xf32>
}

// -----

// Constant cluster operand.
// CHECK-LABEL: @relu
// CHECK-SAME:  (%[[ARG:.*]]: tensor<*xf32>)
func @relu(%arg : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[C0:.*]] = mhlo.constant dense<0.000000e+00>
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]], %[[C0]])
  // CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xf32>, %[[C0_:.*]]: tensor<f32>):
  // CHECK:   %[[TMP:.*]] = chlo.broadcast_maximum %[[ARG_]], %[[C0_]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP]])
  // CHECK: return %[[RES]]
  %0 = mhlo.constant dense<0.000000e+00> : tensor<f32>
  %1 = chlo.broadcast_maximum %0, %arg
      : (tensor<f32>, tensor<*xf32>) -> tensor<*xf32>
  return %1 : tensor<*xf32>
}

// CHECK-SCF-LABEL: @relu
// CHECK-SCF-SAME:  (%[[ARG:.*]]: tensor<*xf32>)
// CHECK-SCF:       %[[C0:.*]] = mhlo.constant dense<0.000000e+00>
// CHECK-SCF:       %[[SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[N:.*]] = shape.num_elements %[[SHAPE]]
// CHECK-SCF:       %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]
// CHECK-SCF:       %[[FLAT_ARG:.*]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
// CHECK-SCF:       %[[UNSHAPED_RES:.*]] = chlo.broadcast_maximum %[[FLAT_ARG]], %[[C0]] : (tensor<?xf32>, tensor<f32>)
// CHECK-SCF:       %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[RES:.*]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
// CHECK-SCF:       return %[[RES]]

// -----

// Cluster with binary non-broadcasting operation.
// CHECK-LABEL: @angle
// CHECK-SAME:  (%[[ARG:.*]]: tensor<*xcomplex<f32>>)
func @angle(%arg : tensor<*xcomplex<f32>>) -> tensor<*xf32> {
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]])
  // CHECK: ^bb0(%[[ARG_:.*]]: tensor<*xcomplex<f32>>):
  // CHECK:   %[[IMAG:.*]] = "mhlo.imag"(%[[ARG_]])
  // CHECK:   %[[REAL:.*]] = "mhlo.real"(%[[ARG_]])
  // CHECK:   %[[TMP:.*]] = mhlo.atan2 %[[IMAG]], %[[REAL]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP]])
  // CHECK: return %[[RES]]
  %0 = "mhlo.imag"(%arg) : (tensor<*xcomplex<f32>>) -> tensor<*xf32>
  %1 = "mhlo.real"(%arg) : (tensor<*xcomplex<f32>>) -> tensor<*xf32>
  %2 = mhlo.atan2 %0, %1 : tensor<*xf32>
  return %2 : tensor<*xf32>
}

// CHECK-SCF-LABEL: @angle
// CHECK-SCF-SAME:  (%[[ARG:.*]]: tensor<*xcomplex<f32>>)
// CHECK-SCF:       %[[SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[N:.*]] = shape.num_elements %[[SHAPE]]
// CHECK-SCF:       %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]
// CHECK-SCF:       %[[FLAT_ARG:.*]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<*xcomplex<f32>>, tensor<1xindex>) -> tensor<?xcomplex<f32>>
// CHECK-SCF:       %[[IMAG:.*]] = "mhlo.imag"(%[[FLAT_ARG]]) : (tensor<?xcomplex<f32>>)
// CHECK-SCF:       %[[REAL:.*]] = "mhlo.real"(%[[FLAT_ARG]]) : (tensor<?xcomplex<f32>>)
// CHECK-SCF:       %[[UNSHAPED_RES:.*]] = mhlo.atan2 %[[IMAG]], %[[REAL]] : tensor<?xf32>
// CHECK-SCF:       %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]
// CHECK-SCF:       %[[RES:.*]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
// CHECK-SCF:       return %[[RES]]

// -----

// CHECK-LABEL: @xlogy
// CHECK-SAME:  (%[[ARG0:.*]]: tensor<*xf32>, %[[ARG1:.*]]: tensor<*xf32>)
func @xlogy(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>) -> tensor<*xf32> {
  // CHECK: %[[C0:.*]] = mhlo.constant dense<0.000000e+00>
  // CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[C0]], %[[ARG0]], %[[ARG1]])
  // CHECK: ^bb0(%[[C0_:.*]]: tensor<f32>, %[[ARG0_:.*]]: tensor<*xf32>, %[[ARG1_:.*]]: tensor<*xf32>):
  // CHECK:   %[[TMP0:.*]] = chlo.broadcast_compare %[[ARG0_]], %[[C0_]] {comparison_direction = "EQ"}
  // CHECK:   %[[TMP1:.*]] = "mhlo.log"(%[[ARG1_]])
  // CHECK:   %[[TMP2:.*]] = chlo.broadcast_multiply %[[ARG0_]], %[[TMP1]]
  // CHECK:   %[[TMP3:.*]] = chlo.broadcast_select %[[TMP0]], %[[C0_]], %[[TMP2]]
  // CHECK:   "chlo.rank_specialization_cluster_yield"(%[[TMP3]])
  // CHECK: return %[[RES]]
  %0 = mhlo.constant dense<0.000000e+00> : tensor<f32>
  %1 = tensor.cast %0 : tensor<f32> to tensor<f32>
  %2 = chlo.broadcast_compare %arg0, %1 {comparison_direction = "EQ"}
      : (tensor<*xf32>, tensor<f32>) -> tensor<*xi1>
  %3 = "mhlo.log"(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
  %4 = chlo.broadcast_multiply %arg0, %3
      : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
  %5 = chlo.broadcast_select %2, %1, %4
      : (tensor<*xi1>, tensor<f32>, tensor<*xf32>) -> tensor<*xf32>
  return %5 : tensor<*xf32>
}
[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00			`// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster \| FileCheck %s`
[MLIR][HLO] Add `rank-specialization-to-scf` pass Currently the lowering is only implemented for the unary case. The n-ary case will follow. PiperOrigin-RevId: 374162772 2021-05-17 18:55:32 +08:00			`// RUN: mlir-hlo-opt %s --split-input-file --mhlo-rank-specialization-cluster --mhlo-rank-specialization-to-scf \| FileCheck %s --check-prefix CHECK-SCF`
[MLIR][HLO] Add `rank-specialization-cluster` pass Add a pass to cluster unranked C/HLO operations in one `chlo.rank_specialization_cluster` op. The C/HLO operations are moved to the body of the operation. Later passes can use this to rank-specialize all these operations together. PiperOrigin-RevId: 373336725 2021-05-12 18:45:09 +08:00
			`// CHECK-LABEL: @add_mul`
			`// CHECK-SAME: (%[[ARG0:.]]: tensor<xf32>, %[[ARG1:.]]: tensor<xf32>, %[[ARG2:.]]: tensor<xf32>)`
			`func @add_mul(%arg0 : tensor<xf32>, %arg1 : tensor<xf32>,`
			`%arg2 : tensor<xf32>) -> tensor<xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG0]], %[[ARG1]]) ( {`
			`// CHECK: ^bb0(%[[ARG2_:.]]: tensor<xf32>, %[[ARG0_:.]]: tensor<xf32>, %[[ARG1_:.]]: tensor<xf32>):`
			`// CHECK: %[[TMP:.*]] = chlo.broadcast_multiply %[[ARG0_]], %[[ARG1_]]`
			`// CHECK: %[[INNER_RES:.*]] = chlo.broadcast_add %[[TMP]], %[[ARG2_]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[INNER_RES]])`
			`// CHECK: }) : (tensor<xf32>, tensor<xf32>, tensor<xf32>) -> tensor<xf32>`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.broadcast_multiply %arg0, %arg1`
			`: (tensor<xf32>, tensor<xf32>) -> tensor<*xf32>`
			`%1 = chlo.broadcast_add %0, %arg2`
			`: (tensor<xf32>, tensor<xf32>) -> tensor<*xf32>`
			`return %1 : tensor<*xf32>`
			`}`
[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00
			`// -----`

			`// Unary MHLO operation.`
			`// CHECK-LABEL: @sqrt`
			`// CHECK-SAME: (%[[ARG:.]]: tensor<xf32>)`
			`func @sqrt(%arg : tensor<xf32>) -> tensor<xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]])`
			`// CHECK: ^bb0(%[[ARG_:.]]: tensor<xf32>):`
			`// CHECK: %[[TMP0:.*]] = "mhlo.sqrt"(%[[ARG_]])`
			`// CHECK: %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]])`
			`// CHECK: %[[TMP2:.*]] = "mhlo.sqrt"(%[[TMP1]])`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP2]])`
			`// CHECK: return %[[RES]]`
			`%0 = "mhlo.sqrt"(%arg) : (tensor<xf32>) -> tensor<xf32>`
			`%1 = "mhlo.sqrt"(%0) : (tensor<xf32>) -> tensor<xf32>`
			`%2 = "mhlo.sqrt"(%1) : (tensor<xf32>) -> tensor<xf32>`
			`return %2 : tensor<*xf32>`
			`}`

[MLIR][HLO] Add `rank-specialization-to-scf` pass Currently the lowering is only implemented for the unary case. The n-ary case will follow. PiperOrigin-RevId: 374162772 2021-05-17 18:55:32 +08:00			`// CHECK-SCF-LABEL: @sqrt`
			`// CHECK-SCF-SAME: (%[[ARG:.]]: tensor<xf32>)`
			`// CHECK-SCF: %[[SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[N:.*]] = shape.num_elements %[[SHAPE]]`
			`// CHECK-SCF: %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]`
			`// CHECK-SCF: %[[FLAT_ARG:.]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<xf32>, tensor<1xindex>) -> tensor<?xf32>`
			`// CHECK-SCF: %[[TMP0:.*]] = "mhlo.sqrt"(%[[FLAT_ARG]]) : (tensor<?xf32>)`
			`// CHECK-SCF: %[[TMP1:.*]] = "mhlo.sqrt"(%[[TMP0]]) : (tensor<?xf32>)`
[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// CHECK-SCF: %[[UNSHAPED_RES:.*]] = "mhlo.sqrt"(%[[TMP1]]) : (tensor<?xf32>)`
			`// CHECK-SCF: %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[RES:.]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<xf32>`
[MLIR][HLO] Add `rank-specialization-to-scf` pass Currently the lowering is only implemented for the unary case. The n-ary case will follow. PiperOrigin-RevId: 374162772 2021-05-17 18:55:32 +08:00			`// CHECK-SCF: return %[[RES]]`

[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00			`// -----`

[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// Don't cluster ranked operations.`
[MLIR][HLO] Add more tests for `rank-specialization-cluster` pass PiperOrigin-RevId: 373343750 2021-05-12 19:45:06 +08:00			`// CHECK-LABEL: @sqrt_ranked`
			`// CHECK-SAME: (%[[ARG:.*]]: tensor<3x?xf32>)`
			`func @sqrt_ranked(%arg: tensor<3x?xf32>) -> tensor<3x?xf32> {`
			`// CHECK-NOT: rank_specialization_cluster`
			`%0 = "mhlo.sqrt"(%arg) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`%1 = "mhlo.sqrt"(%0) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`%2 = "mhlo.sqrt"(%1) : (tensor<3x?xf32>) -> tensor<3x?xf32>`
			`return %2 : tensor<3x?xf32>`
			`}`
[MLIR][HLO] Allow rank specialization clustering with `chlo.broadcast_select` op PiperOrigin-RevId: 373379990 2021-05-12 23:55:45 +08:00
			`// -----`

			`// Ternary operation.`
			`// CHECK-LABEL: @select_mixed`
			`// CHECK-SAME: (%[[PRED:.]]: tensor<xi1>, %[[ARG1:.]]: tensor<xf32>, %[[ARG2:.*]]: tensor<2xf32>)`
			`func @select_mixed(%pred: tensor<xi1>, %arg1: tensor<xf32>,`
			`%arg2: tensor<2xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[PRED]], %[[ARG1]], %[[ARG2]])`
			`// CHECK: ^bb0(%[[PRED_:.]]: tensor<xi1>, %[[ARG1_:.]]: tensor<xf32>, %[[ARG2_:.*]]: tensor<2xf32>)`
			`// CHECK: %[[TMP:.*]] = chlo.broadcast_select %[[PRED_]], %[[ARG1_]], %[[ARG2_]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP]])`
			`// CHECK: return %[[RES]]`
			`%0 = "chlo.broadcast_select"(%pred, %arg1, %arg2)`
			`: (tensor<xi1>, tensor<xf32>, tensor<2xf32>) -> tensor<*xf32>`
			`return %0 : tensor<*xf32>`
			`}`
[MLIR][HLO] Support CHLO unary operations in rank specialization clustering PiperOrigin-RevId: 373397321 2021-05-13 01:20:03 +08:00
			`// -----`

			`// Unary CHLO operation.`
			`// CHECK-LABEL: @tan`
[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// CHECK-SAME: (%[[ARG:.]]: tensor<xf32>)`
[MLIR][HLO] Support CHLO unary operations in rank specialization clustering PiperOrigin-RevId: 373397321 2021-05-13 01:20:03 +08:00			`func @tan(%arg : tensor<xf32>) -> tensor<xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]]) ( {`
			`// CHECK: ^bb0(%[[ARG_:.]]: tensor<xf32>)`
			`// CHECK: %[[TMP0:.*]] = chlo.tan %[[ARG_]]`
			`// CHECK: %[[TMP1:.*]] = chlo.tan %[[TMP0]]`
			`// CHECK: %[[TMP2:.*]] = chlo.tan %[[TMP1]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP2]])`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.tan %arg : tensor<xf32> -> tensor<xf32>`
			`%1 = chlo.tan %0 : tensor<xf32> -> tensor<xf32>`
			`%2 = chlo.tan %1 : tensor<xf32> -> tensor<xf32>`
			`return %2 : tensor<*xf32>`
			`}`
[MLIR][HLO] Add mixed test for `rank-specialization-cluster` pass PiperOrigin-RevId: 373762814 2021-05-14 19:38:10 +08:00
[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// CHECK-SCF-LABEL: @tan`
			`// CHECK-SCF-SAME: (%[[ARG:.]]: tensor<xf32>)`
			`// CHECK-SCF: %[[SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[N:.*]] = shape.num_elements %[[SHAPE]]`
			`// CHECK-SCF: %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]`
			`// CHECK-SCF: %[[FLAT_ARG:.]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<xf32>, tensor<1xindex>) -> tensor<?xf32>`
			`// CHECK-SCF: %[[TMP0:.*]] = chlo.tan %[[FLAT_ARG]] : tensor<?xf32>`
			`// CHECK-SCF: %[[TMP1:.*]] = chlo.tan %[[TMP0]] : tensor<?xf32>`
			`// CHECK-SCF: %[[UNSHAPED_RES:.*]] = chlo.tan %[[TMP1]] : tensor<?xf32>`
			`// CHECK-SCF: %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[RES:.]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<xf32>`
			`// CHECK-SCF: return %[[RES]]`

[MLIR][HLO] Add mixed test for `rank-specialization-cluster` pass PiperOrigin-RevId: 373762814 2021-05-14 19:38:10 +08:00			`// -----`

			`// Composition of unary/binary CHLO and unary MHLO ops.`
			`// CHECK-LABEL: @mixed`
			`// CHECK-SAME: (%[[ARG0:.]]: tensor<xf32>, %[[ARG1:.]]: tensor<xf32>, %[[ARG2:.]]: tensor<xf32>)`
			`func @mixed(%arg0 : tensor<xf32>, %arg1 : tensor<xf32>, %arg2 : tensor<*xf32>)`
			`-> tensor<*xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG2]], %[[ARG1]], %[[ARG0]])`
			`// CHECK: ^bb0(%[[ARG2_:.]]: tensor<xf32>, %[[ARG1_:.]]: tensor<xf32>, %[[ARG0_:.]]: tensor<xf32>)`
			`// CHECK: %[[TMP0:.*]] = chlo.tan %[[ARG0_]]`
			`// CHECK: %[[TMP1:.*]] = "mhlo.sqrt"(%[[ARG1_]])`
			`// CHECK: %[[TMP2:.*]] = chlo.broadcast_multiply %[[TMP0]], %[[TMP1]]`
			`// CHECK: %[[TMP3:.*]] = chlo.broadcast_add %[[TMP2]], %[[ARG2_]]`
			`// CHECK: %[[TMP4:.*]] = "mhlo.sqrt"(%[[TMP3]])`
			`// CHECK: %[[TMP5:.*]] = chlo.tan %[[TMP4]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP5]])`
			`// CHECK: return %[[RES]]`
			`%0 = chlo.tan %arg0 : tensor<xf32> -> tensor<xf32>`
			`%1 = "mhlo.sqrt"(%arg1) : (tensor<xf32>) -> tensor<xf32>`
			`%2 = chlo.broadcast_multiply %0, %1`
			`: (tensor<xf32>, tensor<xf32>) -> tensor<*xf32>`
			`%3 = chlo.broadcast_add %2, %arg2`
			`: (tensor<xf32>, tensor<xf32>) -> tensor<*xf32>`
			`%4 = "mhlo.sqrt"(%3) : (tensor<xf32>) -> tensor<xf32>`
			`%5 = chlo.tan %4 : tensor<xf32> -> tensor<xf32>`
			`return %5 : tensor<*xf32>`
			`}`
[MLIR][HLO] Extend rank specialization clustering pass Also cluster operations that operate on same shape operands. These implicitly satisfy the broadcasting semantics requirement. Also, add test cases for some cases that appear in the current MLIR-generated kernels. PiperOrigin-RevId: 374191950 2021-05-17 22:30:45 +08:00
			`// -----`

			`// Constant cluster operand.`
			`// CHECK-LABEL: @relu`
			`// CHECK-SAME: (%[[ARG:.]]: tensor<xf32>)`
			`func @relu(%arg : tensor<xf32>) -> tensor<xf32> {`
			`// CHECK: %[[C0:.*]] = mhlo.constant dense<0.000000e+00>`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]], %[[C0]])`
			`// CHECK: ^bb0(%[[ARG_:.]]: tensor<xf32>, %[[C0_:.*]]: tensor<f32>):`
			`// CHECK: %[[TMP:.*]] = chlo.broadcast_maximum %[[ARG_]], %[[C0_]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP]])`
			`// CHECK: return %[[RES]]`
			`%0 = mhlo.constant dense<0.000000e+00> : tensor<f32>`
			`%1 = chlo.broadcast_maximum %0, %arg`
			`: (tensor<f32>, tensor<xf32>) -> tensor<xf32>`
			`return %1 : tensor<*xf32>`
			`}`

[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// CHECK-SCF-LABEL: @relu`
			`// CHECK-SCF-SAME: (%[[ARG:.]]: tensor<xf32>)`
			`// CHECK-SCF: %[[C0:.*]] = mhlo.constant dense<0.000000e+00>`
			`// CHECK-SCF: %[[SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[N:.*]] = shape.num_elements %[[SHAPE]]`
			`// CHECK-SCF: %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]`
			`// CHECK-SCF: %[[FLAT_ARG:.]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<xf32>, tensor<1xindex>) -> tensor<?xf32>`
			`// CHECK-SCF: %[[UNSHAPED_RES:.*]] = chlo.broadcast_maximum %[[FLAT_ARG]], %[[C0]] : (tensor<?xf32>, tensor<f32>)`
			`// CHECK-SCF: %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[RES:.]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<xf32>`
			`// CHECK-SCF: return %[[RES]]`

[MLIR][HLO] Extend rank specialization clustering pass Also cluster operations that operate on same shape operands. These implicitly satisfy the broadcasting semantics requirement. Also, add test cases for some cases that appear in the current MLIR-generated kernels. PiperOrigin-RevId: 374191950 2021-05-17 22:30:45 +08:00			`// -----`

			`// Cluster with binary non-broadcasting operation.`
			`// CHECK-LABEL: @angle`
			`// CHECK-SAME: (%[[ARG:.]]: tensor<xcomplex<f32>>)`
			`func @angle(%arg : tensor<xcomplex<f32>>) -> tensor<xf32> {`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[ARG]])`
			`// CHECK: ^bb0(%[[ARG_:.]]: tensor<xcomplex<f32>>):`
			`// CHECK: %[[IMAG:.*]] = "mhlo.imag"(%[[ARG_]])`
			`// CHECK: %[[REAL:.*]] = "mhlo.real"(%[[ARG_]])`
			`// CHECK: %[[TMP:.*]] = mhlo.atan2 %[[IMAG]], %[[REAL]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP]])`
			`// CHECK: return %[[RES]]`
			`%0 = "mhlo.imag"(%arg) : (tensor<xcomplex<f32>>) -> tensor<xf32>`
			`%1 = "mhlo.real"(%arg) : (tensor<xcomplex<f32>>) -> tensor<xf32>`
			`%2 = mhlo.atan2 %0, %1 : tensor<*xf32>`
			`return %2 : tensor<*xf32>`
			`}`

[MLIR][HLO] Generalize rank specialization with single operand The pattern can be generalized to also rank specialize operations with a single non-scalar operand. Also extract helper functions that can be reused in following specializations. PiperOrigin-RevId: 374198381 2021-05-17 23:11:59 +08:00			`// CHECK-SCF-LABEL: @angle`
			`// CHECK-SCF-SAME: (%[[ARG:.]]: tensor<xcomplex<f32>>)`
			`// CHECK-SCF: %[[SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[N:.*]] = shape.num_elements %[[SHAPE]]`
			`// CHECK-SCF: %[[FLAT_SHAPE:.*]] = tensor.from_elements %[[N]]`
			`// CHECK-SCF: %[[FLAT_ARG:.]] = "mhlo.dynamic_reshape"(%[[ARG]], %[[FLAT_SHAPE]]) : (tensor<xcomplex<f32>>, tensor<1xindex>) -> tensor<?xcomplex<f32>>`
			`// CHECK-SCF: %[[IMAG:.*]] = "mhlo.imag"(%[[FLAT_ARG]]) : (tensor<?xcomplex<f32>>)`
			`// CHECK-SCF: %[[REAL:.*]] = "mhlo.real"(%[[FLAT_ARG]]) : (tensor<?xcomplex<f32>>)`
			`// CHECK-SCF: %[[UNSHAPED_RES:.*]] = mhlo.atan2 %[[IMAG]], %[[REAL]] : tensor<?xf32>`
			`// CHECK-SCF: %[[RES_SHAPE:.*]] = shape.shape_of %[[ARG]]`
			`// CHECK-SCF: %[[RES:.]] = "mhlo.dynamic_reshape"(%[[UNSHAPED_RES]], %[[RES_SHAPE]]) : (tensor<?xf32>, tensor<?xindex>) -> tensor<xf32>`
			`// CHECK-SCF: return %[[RES]]`

[MLIR][HLO] Extend rank specialization clustering pass Also cluster operations that operate on same shape operands. These implicitly satisfy the broadcasting semantics requirement. Also, add test cases for some cases that appear in the current MLIR-generated kernels. PiperOrigin-RevId: 374191950 2021-05-17 22:30:45 +08:00			`// -----`

			`// CHECK-LABEL: @xlogy`
			`// CHECK-SAME: (%[[ARG0:.]]: tensor<xf32>, %[[ARG1:.]]: tensor<xf32>)`
			`func @xlogy(%arg0 : tensor<xf32>, %arg1 : tensor<xf32>) -> tensor<*xf32> {`
			`// CHECK: %[[C0:.*]] = mhlo.constant dense<0.000000e+00>`
			`// CHECK: %[[RES:.*]] = "chlo.rank_specialization_cluster"(%[[C0]], %[[ARG0]], %[[ARG1]])`
			`// CHECK: ^bb0(%[[C0_:.]]: tensor<f32>, %[[ARG0_:.]]: tensor<xf32>, %[[ARG1_:.]]: tensor<*xf32>):`
			`// CHECK: %[[TMP0:.*]] = chlo.broadcast_compare %[[ARG0_]], %[[C0_]] {comparison_direction = "EQ"}`
			`// CHECK: %[[TMP1:.*]] = "mhlo.log"(%[[ARG1_]])`
			`// CHECK: %[[TMP2:.*]] = chlo.broadcast_multiply %[[ARG0_]], %[[TMP1]]`
			`// CHECK: %[[TMP3:.*]] = chlo.broadcast_select %[[TMP0]], %[[C0_]], %[[TMP2]]`
			`// CHECK: "chlo.rank_specialization_cluster_yield"(%[[TMP3]])`
			`// CHECK: return %[[RES]]`
			`%0 = mhlo.constant dense<0.000000e+00> : tensor<f32>`
			`%1 = tensor.cast %0 : tensor<f32> to tensor<f32>`
			`%2 = chlo.broadcast_compare %arg0, %1 {comparison_direction = "EQ"}`
			`: (tensor<xf32>, tensor<f32>) -> tensor<xi1>`
			`%3 = "mhlo.log"(%arg1) : (tensor<xf32>) -> tensor<xf32>`
			`%4 = chlo.broadcast_multiply %arg0, %3`
			`: (tensor<xf32>, tensor<xf32>) -> tensor<*xf32>`
			`%5 = chlo.broadcast_select %2, %1, %4`
			`: (tensor<xi1>, tensor<f32>, tensor<xf32>) -> tensor<*xf32>`
			`return %5 : tensor<*xf32>`
			`}`