Lower tile to Krnl (#308)

* alloc for unknown shape

* determine affine

* format

* test for unknown input

* Update test.py

* fix the expression

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test lit

Signed-off-by: chentong <chentong@us.ibm.com>

* remove affine load

Signed-off-by: chentong <chentong@us.ibm.com>

* format

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test

Signed-off-by: chentong <chentong@us.ibm.com>

* fix Affineload

Signed-off-by: chentong <chentong@us.ibm.com>

* affine for alternative

Signed-off-by: chentong <chentong@us.ibm.com>

* use DimOp

Signed-off-by: chentong <chentong@us.ibm.com>

* change test case

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test

Signed-off-by: chentong <chentong@us.ibm.com>

* use more auto type

Signed-off-by: chentong <chentong@us.ibm.com>

* fix affine load

Signed-off-by: chentong <chentong@us.ibm.com>

* small fix

Signed-off-by: chentong <chentong@us.ibm.com>

Co-authored-by: Tian Jin <tjingrant@gmail.com>
chentong319 2020-10-05 00:50:59 -04:00, committed by GitHub
parent cb3d1e4f64
commit 931127c7e9
6 changed files with 313 additions and 1 deletion

src/Conversion/ONNXToKrnl/CMakeLists.txt

@@ -25,6 +25,7 @@ add_library(OMONNXToKrnl
    Tensor/Split.cpp
    Tensor/Gather.cpp
    Tensor/Size.cpp
    Tensor/Tile.cpp
    ConvertONNXToKrnl.cpp)
target_link_libraries(OMONNXToKrnl
    onnx)

src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp

@@ -104,6 +104,7 @@ void FrontendToKrnlLoweringPass::runOnOperation() {
  populateLoweringONNXSqueezeOpPattern(patterns, &getContext());
  populateLoweringONNXSplitOpPattern(patterns, &getContext());
  populateLoweringONNXSizeOpPattern(patterns, &getContext());
  populateLoweringONNXTileOpPattern(patterns, &getContext());
  // Neural network
  populateLoweringONNXConvOpPattern(patterns, &getContext());
  populateLoweringONNXNormalizationOpPattern(patterns, &getContext());

src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp

@@ -256,6 +256,9 @@ void populateLoweringONNXSplitOpPattern(
void populateLoweringONNXSizeOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);
void populateLoweringONNXTileOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);
bool checkOpResultIsUsedByGetRef(AllocOp *allocOp);
int64_t getMemRefSizeInBytes(Value val);

src/Conversion/ONNXToKrnl/Tensor/Tile.cpp (new file)

@@ -0,0 +1,228 @@
//===----------------Tile.cpp - Lowering Tile Op----------------------===//
//
// Copyright 2020 The IBM Research Authors.
//
// =============================================================================
//
// This file lowers the ONNX Tile Operator to Krnl dialect.
//
//===----------------------------------------------------------------------===//
#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"

using namespace mlir;

//===----------------------------------------------------------------------===//
// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
//
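// The size of each output dimension is dim(input, i) * repeats[i], where
// repeats[i] is loaded from the repeats operand at runtime.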
Value insertAllocAndDeallocForTile(MemRefType memRefType, Location loc,
ConversionPatternRewriter &rewriter, bool insertDealloc, Value inputOperand,
Value repeatsOperand) {
AllocOp alloc;
auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
int64_t inputRank = inputShape.size();
SmallVector<Value, 4> allocOperands;
for (int i = 0; i < inputRank; ++i) {
auto indexVal = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
auto repeatsLoadVal =
rewriter.create<AffineLoadOp>(loc, repeatsOperand, repeatsMemRefVal);
auto repeatsElementVal = rewriter.create<IndexCastOp>(
loc, repeatsLoadVal, rewriter.getIndexType());
auto dimVal = rewriter.create<DimOp>(loc, inputOperand, i);
Value allocDimVal = rewriter.create<MulIOp>(loc, dimVal, repeatsElementVal);
allocOperands.emplace_back(allocDimVal);
}
alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
if (insertDealloc) {
auto *parentBlock = alloc.getOperation()->getBlock();
auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
dealloc.getOperation()->moveBefore(&parentBlock->back());
}
return alloc;
}

struct ONNXTileOpLowering : public ConversionPattern {
ONNXTileOpLowering(MLIRContext *ctx)
: ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const final {
ONNXTileOpAdaptor operandAdaptor(operands);
ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
auto loc = op->getLoc();
// get input operands, shapes, and rank
Value input = operandAdaptor.input();
auto inputMemRefType = input.getType().cast<MemRefType>();
auto inputShape = inputMemRefType.getShape();
int64_t inputRank = inputShape.size();
Value repeats = operandAdaptor.repeats();
// get output info
auto resultOperand = tileOp.output();
auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
auto outputMemRefShape = outputMemRefType.getShape();
int64_t outputRank = outputMemRefShape.size();
bool insertDealloc = checkInsertDealloc(op);
Value alloc;
if (hasAllConstantDimensions(outputMemRefType))
alloc =
insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
else
alloc = insertAllocAndDeallocForTile(
outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
// Define loops and iteration trip counts (equivalent to size of output)
std::vector<Value> originalLoops;
defineLoops(rewriter, loc, originalLoops, outputRank);
KrnlIterateOperandPack pack(rewriter, originalLoops);
for (int ii = 0; ii < outputRank; ++ii)
addDimensionToPack(rewriter, loc, pack, alloc, ii);
// Create the loops
auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
Block &iterationBlock = iterateOp.bodyRegion().front();
// Now perform the insertions into the body of the just generated loops.
// Insert instructions inside the KrnlIterateOp body.
rewriter.setInsertionPointToStart(&iterationBlock);
// Handle the operations.
// This implementation iterates over the output tensor, so the store has a
// simple affine subscript expression.
// An alternative is to iterate over the input tensor and the repeats, so
// that loads of input elements can be reused explicitly; however, the store
// subscript is then not contiguous, and may not even be affine.
// That alternative implementation can be found at the end of this file.
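// For example, tiling a dimension of size n = 4 with repeats r = 3 gives an
// output extent of 12, and output index d maps to input index d mod 4:
//   d:       0 1 2 3 4 5 6 7 8 9 10 11
//   d mod 4: 0 1 2 3 0 1 2 3 0 1  2  3
// The affine map (d0)[s0] -> (d0 mod s0) built below encodes exactly this.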
SmallVector<Value, 4> inputMemRefVal;
for (int i = 0; i < outputRank; ++i) {
auto indexAE = rewriter.getAffineDimExpr(0);
auto offsetAE = rewriter.getAffineSymbolExpr(0);
auto dimMap = AffineMap::get(1, 1, indexAE % offsetAE);
auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
auto loopVarVal = iterationBlock.getArguments()[i];
auto exprVal = rewriter.create<AffineApplyOp>(
loc, dimMap, ArrayRef<Value>{loopVarVal, inputDimSizeVal});
inputMemRefVal.emplace_back(exprVal);
}
// Load the value from the input.
// An affine load is used when the input has a constant shape.
Value inputVal;
if (hasAllConstantDimensions(inputMemRefType))
inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
else
inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
SmallVector<Value, 4> outputMemRefVal(iterationBlock.getArguments().begin(),
iterationBlock.getArguments().end());
// Then store the value in the output.
rewriter.create<AffineStoreOp>(loc, inputVal, alloc, outputMemRefVal);
rewriter.replaceOp(op, alloc);
return success();
}
};

// This is an alternative way of lowering, kept here for the record in case
// this implementation is needed later.
struct ONNXTileOpLoweringAlternative : public ConversionPattern {
ONNXTileOpLoweringAlternative(MLIRContext *ctx)
: ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const final {
ONNXTileOpAdaptor operandAdaptor(operands);
ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
auto loc = op->getLoc();
// get input operands, shapes, and rank
Value input = operandAdaptor.input();
auto inputShape = input.getType().cast<MemRefType>().getShape();
int64_t inputRank = inputShape.size();
Value repeats = operandAdaptor.repeats();
// get output info
auto resultOperand = tileOp.output();
auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
auto outputMemRefShape = outputMemRefType.getShape();
int64_t outputRank = outputMemRefShape.size();
bool insertDealloc = checkInsertDealloc(op);
Value alloc;
if (hasAllConstantDimensions(outputMemRefType))
alloc =
insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
else
alloc = insertAllocAndDeallocForTile(
outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
// Define loops and iteration trip counts (equivalent to size of output)
std::vector<Value> originalLoops;
defineLoops(rewriter, loc, originalLoops, outputRank * 2);
KrnlIterateOperandPack pack(rewriter, originalLoops);
for (int ii = 0; ii < outputRank; ++ii) {
addDimensionToPack(rewriter, loc, pack, input, ii);
pack.pushConstantBound(0);
auto indexVal =
emitConstantOp(rewriter, loc, rewriter.getIndexType(), ii);
SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
auto repeatsLoadVal =
rewriter.create<AffineLoadOp>(loc, repeats, repeatsMemRefVal);
auto repeatsElementVal = rewriter.create<IndexCastOp>(
loc, repeatsLoadVal, rewriter.getIndexType());
pack.pushOperandBound(repeatsElementVal);
}
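// The resulting nest interleaves the two index kinds per dimension: loop
// 2*i iterates over the extent of input dimension i and loop 2*i+1 over
// repeats[i], giving outputRank * 2 loops in total.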
// Create the loops
auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
Block &iterationBlock = iterateOp.bodyRegion().front();
// Now perform the insertions into the body of the just generated loops.
// Insert instructions inside the KrnlIterateOp body.
rewriter.setInsertionPointToStart(&iterationBlock);
// Handle the operations.
SmallVector<Value, 4> inputMemRefVal;
for (int j = 0; j < inputRank; ++j) {
inputMemRefVal.emplace_back(iterationBlock.getArguments()[j * 2]);
}
SmallVector<Value, 4> outputMemRefVal;
for (int i = 0; i < inputRank; ++i) {
auto inputIndexAE = rewriter.getAffineDimExpr(0);
auto repeatsIndexAE = rewriter.getAffineDimExpr(1);
auto inputDimAE = rewriter.getAffineSymbolExpr(0);
auto dimMap =
AffineMap::get(2, 1, inputDimAE * repeatsIndexAE + inputIndexAE);
auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
auto dimExprVal = rewriter.create<AffineApplyOp>(loc, dimMap,
ArrayRef<Value>{iterationBlock.getArguments()[2 * i],
iterationBlock.getArguments()[2 * i + 1], inputDimSizeVal});
outputMemRefVal.emplace_back(dimExprVal);
}
auto inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
rewriter.create<StoreOp>(loc, inputVal, alloc, outputMemRefVal);
rewriter.replaceOp(op, alloc);
return success();
}
};

void populateLoweringONNXTileOpPattern(
OwningRewritePatternList &patterns, MLIRContext *ctx) {
patterns.insert<ONNXTileOpLowering>(ctx);
}
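For reference, the index arithmetic behind the two lowerings can be checked against a minimal standalone C++ sketch of the 1-D case (illustrative only, not part of the commit; tile1D and tile1DAlt are made-up names), with input size n and repeat count r:

#include <cstdint>
#include <vector>

// Main lowering: iterate the output; the load subscript is i mod n,
// matching the affine map (d0)[s0] -> (d0 mod s0).
std::vector<float> tile1D(const std::vector<float> &in, int64_t r) {
  const int64_t n = in.size();
  std::vector<float> out(n * r);
  for (int64_t i = 0; i < n * r; ++i)
    out[i] = in[i % n];
  return out;
}

// Alternative lowering: iterate the input and the repeats; each input
// element is loaded once and stored r times at subscript n * q + i,
// matching the affine map (d0, d1)[s0] -> (s0 * d1 + d0).
std::vector<float> tile1DAlt(const std::vector<float> &in, int64_t r) {
  const int64_t n = in.size();
  std::vector<float> out(n * r);
  for (int64_t i = 0; i < n; ++i)
    for (int64_t q = 0; q < r; ++q)
      out[q * n + i] = in[i];
  return out;
}

Both compute the same result; the commit registers only the first pattern, since its load and store subscripts stay affine for tensors of any rank.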

test/backend/test.py

@@ -420,6 +420,10 @@ test_to_enable = [
    "test_split_variable_parts_2d_cpu",
    "test_split_variable_parts_default_axis_cpu",
    # Tile
    "test_tile_cpu",
    "test_tile_precomputed_cpu",
    # ConstantOfShape
    "test_constantofshape_float_ones_cpu",

test/mlir/onnx/onnx_lowering.mlir

@@ -2297,3 +2297,78 @@ func @test_constant_of_shape_static_dims() -> tensor<*xf32> {
  // CHECK: }
  // CHECK: return [[RES]] : memref<3x4x5xf32>
}
// -----
// Test Tile with 2D input and constant repeats
func @test_tile1(%arg0 : tensor<4x8xf32>) -> tensor<*xf32> {
%0 = "onnx.Constant"() { value = dense<[3, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
%1 = "onnx.Tile"(%arg0, %0) : (tensor<4x8xf32>, tensor<2xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile1
// CHECK: [[R0:%.+]] = alloc() : memref<12x16xf32>
// CHECK: [[R1:%.+]] = "krnl.global"() {name = "constant_0", shape = [2], value = dense<[3, 2]> : tensor<2xi64>} : () -> memref<2xi64>
// CHECK: [[R2:%.+]]:2 = krnl.define_loops 2
// CHECK: krnl.iterate([[R2]]#0, [[R2]]#1) with ([[R2]]#0 -> [[ARG1:%.+]] = 0 to 12, [[R2]]#1 -> [[ARG2:%.+]] = 0 to 16) {
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R3:%.+]] = dim %arg0, [[C0]] : memref<4x8xf32>
// CHECK: [[R4:%.+]] = affine.apply [[INDEX_MAP]]([[ARG1]]){{\[}}[[R3]]{{\]}}
// CHECK: [[C1:%.+]] = constant 1 : index
// CHECK: [[R5:%.+]] = dim %arg0, [[C1]] : memref<4x8xf32>
// CHECK: [[R6:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R5]]{{\]}}
// CHECK: [[R7:%.+]] = affine.load %arg0{{\[}}[[R4]], [[R6]]{{\]}} : memref<4x8xf32>
// CHECK: affine.store [[R7]], [[R0]]{{\[}}[[ARG1]], [[ARG2]]{{\]}} : memref<12x16xf32>
}
// -----
// Test Tile with 1D input and unknown repeats
func @test_tile2(%arg0 : tensor<8xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
%1 = "onnx.Tile"(%arg0, %arg1) : (tensor<8xf32>, tensor<1xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile2
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
// CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
// CHECK: [[C0_0:%.+]] = constant 0 : index
// CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<8xf32>
// CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
// CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
// CHECK: [[R5:%.+]] = krnl.define_loops 1
// CHECK: [[C0_1:%.+]] = constant 0 : index
// CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
// CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
// CHECK: [[C0_2:%.+]] = constant 0 : index
// CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<8xf32>
// CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
// CHECK: [[R9:%.+]] = affine.load %arg0{{\[}}[[R8]]{{\]}} : memref<8xf32>
// CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}
// -----
// Test Tile with 1D unknown input
func @test_tile3(%arg0 : tensor<?xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
%1 = "onnx.Tile"(%arg0, %arg1) : (tensor<?xf32>, tensor<1xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile3
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
// CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
// CHECK: [[C0_0:%.+]] = constant 0 : index
// CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<?xf32>
// CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
// CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
// CHECK: [[R5:%.+]] = krnl.define_loops 1
// CHECK: [[C0_1:%.+]] = constant 0 : index
// CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
// CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
// CHECK: [[C0_2:%.+]] = constant 0 : index
// CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<?xf32>
// CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
// CHECK: [[R9:%.+]] = load %arg0{{\[}}[[R8]]{{\]}} : memref<?xf32>
// CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}