diff --git a/src/Conversion/ONNXToKrnl/CMakeLists.txt b/src/Conversion/ONNXToKrnl/CMakeLists.txt
index bab2dd3..6b6f799 100644
--- a/src/Conversion/ONNXToKrnl/CMakeLists.txt
+++ b/src/Conversion/ONNXToKrnl/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(OMONNXToKrnl
         Tensor/Split.cpp
         Tensor/Gather.cpp
         Tensor/Size.cpp
+        Tensor/Tile.cpp
         ConvertONNXToKrnl.cpp)
 target_link_libraries(OMONNXToKrnl
         onnx)
diff --git a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
index a559c40..de4f9c5 100644
--- a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
+++ b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp
@@ -104,6 +104,7 @@ void FrontendToKrnlLoweringPass::runOnOperation() {
   populateLoweringONNXSqueezeOpPattern(patterns, &getContext());
   populateLoweringONNXSplitOpPattern(patterns, &getContext());
   populateLoweringONNXSizeOpPattern(patterns, &getContext());
+  populateLoweringONNXTileOpPattern(patterns, &getContext());
   // Neural network
   populateLoweringONNXConvOpPattern(patterns, &getContext());
   populateLoweringONNXNormalizationOpPattern(patterns, &getContext());
diff --git a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
index aa3d848..6045abb 100644
--- a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
+++ b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
@@ -256,6 +256,9 @@ void populateLoweringONNXSplitOpPattern(
 void populateLoweringONNXSizeOpPattern(
     OwningRewritePatternList &patterns, MLIRContext *ctx);
 
+void populateLoweringONNXTileOpPattern(
+    OwningRewritePatternList &patterns, MLIRContext *ctx);
+
 bool checkOpResultIsUsedByGetRef(AllocOp *allocOp);
 
 int64_t getMemRefSizeInBytes(Value val);
diff --git a/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp b/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp
new file mode 100644
index 0000000..b2883f2
--- /dev/null
+++ b/src/Conversion/ONNXToKrnl/Tensor/Tile.cpp
@@ -0,0 +1,228 @@
+//===----------------Tile.cpp - Lowering Tile Op--------------------------===//
+//
+// Copyright 2020 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file lowers the ONNX Tile Operator to Krnl dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
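+// Each output dimension is computed as dim(input, i) * repeats[i]; for
+// example, a 4x8 input tiled with repeats = [3, 2] allocates a 12x16 buffer.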
+//
+
+Value insertAllocAndDeallocForTile(MemRefType memRefType, Location loc,
+    ConversionPatternRewriter &rewriter, bool insertDealloc, Value inputOperand,
+    Value repeatsOperand) {
+  AllocOp alloc;
+  auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
+  auto inputRank = inputShape.size();
+
+  SmallVector<Value, 4> allocOperands;
+  for (int i = 0; i < inputRank; ++i) {
+    // Load repeats[i] and multiply it with the i-th input dimension.
+    auto indexVal = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
+    SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
+    auto repeatsLoadVal =
+        rewriter.create<AffineLoadOp>(loc, repeatsOperand, repeatsMemRefVal);
+    auto repeatsElementVal = rewriter.create<IndexCastOp>(
+        loc, repeatsLoadVal, rewriter.getIndexType());
+    auto dimVal = rewriter.create<DimOp>(loc, inputOperand, i);
+    Value allocDimVal = rewriter.create<MulIOp>(loc, dimVal, repeatsElementVal);
+    allocOperands.emplace_back(allocDimVal);
+  }
+  alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
+  if (insertDealloc) {
+    auto *parentBlock = alloc.getOperation()->getBlock();
+    auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
+    dealloc.getOperation()->moveBefore(&parentBlock->back());
+  }
+  return alloc;
+}
+
+struct ONNXTileOpLowering : public ConversionPattern {
+  ONNXTileOpLowering(MLIRContext *ctx)
+      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
+
+  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    ONNXTileOpAdaptor operandAdaptor(operands);
+    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
+    auto loc = op->getLoc();
+
+    // Get input operands, shapes, and rank.
+    Value input = operandAdaptor.input();
+    auto inputMemRefType = input.getType().cast<MemRefType>();
+    auto inputShape = inputMemRefType.getShape();
+    int64_t inputRank = inputShape.size();
+    Value repeats = operandAdaptor.repeats();
+
+    // Get output info.
+    auto resultOperand = tileOp.output();
+    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
+    auto outputMemRefShape = outputMemRefType.getShape();
+    int64_t outputRank = outputMemRefShape.size();
+
+    bool insertDealloc = checkInsertDealloc(op);
+    Value alloc;
+    if (hasAllConstantDimensions(outputMemRefType))
+      alloc =
+          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
+    else
+      alloc = insertAllocAndDeallocForTile(
+          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
+
+    // Define loops and iteration trip counts (equivalent to the size of the
+    // output).
+    std::vector<Value> originalLoops;
+    defineLoops(rewriter, loc, originalLoops, outputRank);
+    KrnlIterateOperandPack pack(rewriter, originalLoops);
+    for (int ii = 0; ii < outputRank; ++ii)
+      addDimensionToPack(rewriter, loc, pack, alloc, ii);
+
+    // Create the loops.
+    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    Block &iterationBlock = iterateOp.bodyRegion().front();
+
+    // Now perform the insertions into the body of the just generated loops.
+    // Insert instructions inside the KrnlIterateOp body.
+    rewriter.setInsertionPointToStart(&iterationBlock);
+
+    // Handle the operations.
+
+    // This implementation iterates over the output tensor, so the store uses a
+    // simple affine subscript expression. An alternative implementation is to
+    // iterate over the input tensor and the repeats; the loaded input element
+    // could then be reused explicitly, but the subscript of the store would
+    // not be contiguous, and may not even be affine.
+    // The alternative implementation can be found at the end of this file.
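+    // For every output dimension i, the corresponding input index is
+    // output_index mod dim(input, i), i.e. the affine map
+    // (d0)[s0] -> (d0 mod s0) built below. For example, with
+    // dim(input, i) = 4, output indices 0..9 read input indices
+    // 0, 1, 2, 3, 0, 1, 2, 3, 0, 1.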
+    SmallVector<Value, 4> inputMemRefVal;
+    for (int i = 0; i < outputRank; ++i) {
+      auto indexAE = rewriter.getAffineDimExpr(0);
+      auto offsetAE = rewriter.getAffineSymbolExpr(0);
+      auto dimMap = AffineMap::get(1, 1, indexAE % offsetAE);
+
+      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
+      auto loopVarVal = iterationBlock.getArguments()[i];
+      auto exprVal = rewriter.create<AffineApplyOp>(
+          loc, dimMap, ArrayRef<Value>{loopVarVal, inputDimSizeVal});
+      inputMemRefVal.emplace_back(exprVal);
+    }
+
+    // Load the value from the input.
+    // An affine load is used when the input has a constant shape.
+    Value inputVal;
+    if (hasAllConstantDimensions(inputMemRefType))
+      inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
+    else
+      inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
+    SmallVector<Value, 4> outputMemRefVal(iterationBlock.getArguments().begin(),
+        iterationBlock.getArguments().end());
+
+    // Then store the value in the output.
+    rewriter.create<AffineStoreOp>(loc, inputVal, alloc, outputMemRefVal);
+
+    rewriter.replaceOp(op, alloc);
+
+    return success();
+  }
+};
+
+// This is the alternative way of lowering.
+// It is kept here for the record in case this implementation is needed later.
+struct ONNXTileOpLoweringAlternative : public ConversionPattern {
+  ONNXTileOpLoweringAlternative(MLIRContext *ctx)
+      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
+
+  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    ONNXTileOpAdaptor operandAdaptor(operands);
+    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
+    auto loc = op->getLoc();
+
+    // Get input operands, shapes, and rank.
+    Value input = operandAdaptor.input();
+    auto inputShape = input.getType().cast<MemRefType>().getShape();
+    int64_t inputRank = inputShape.size();
+    Value repeats = operandAdaptor.repeats();
+
+    // Get output info.
+    auto resultOperand = tileOp.output();
+    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
+    auto outputMemRefShape = outputMemRefType.getShape();
+    int64_t outputRank = outputMemRefShape.size();
+
+    bool insertDealloc = checkInsertDealloc(op);
+    Value alloc;
+    if (hasAllConstantDimensions(outputMemRefType))
+      alloc =
+          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
+    else
+      alloc = insertAllocAndDeallocForTile(
+          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
+
+    // Define loops and iteration trip counts (equivalent to the size of the
+    // output). Each output dimension gets a pair of loops: one over the input
+    // dimension and one over the corresponding repeats value.
+    std::vector<Value> originalLoops;
+    defineLoops(rewriter, loc, originalLoops, outputRank * 2);
+    KrnlIterateOperandPack pack(rewriter, originalLoops);
+    for (int ii = 0; ii < outputRank; ++ii) {
+      addDimensionToPack(rewriter, loc, pack, input, ii);
+      pack.pushConstantBound(0);
+      auto indexVal =
+          emitConstantOp(rewriter, loc, rewriter.getIndexType(), ii);
+      SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
+      auto repeatsLoadVal =
+          rewriter.create<AffineLoadOp>(loc, repeats, repeatsMemRefVal);
+      auto repeatsElementVal = rewriter.create<IndexCastOp>(
+          loc, repeatsLoadVal, rewriter.getIndexType());
+      pack.pushOperandBound(repeatsElementVal);
+    }
+
+    // Create the loops.
+    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    Block &iterationBlock = iterateOp.bodyRegion().front();
+
+    // Now perform the insertions into the body of the just generated loops.
+    // Insert instructions inside the KrnlIterateOp body.
+    rewriter.setInsertionPointToStart(&iterationBlock);
+
+    // Handle the operations.
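+
+    // For each dimension i, the output subscript is rebuilt from the loop
+    // pair as dim(input, i) * repeat_index + input_index, i.e. the affine map
+    // (d0, d1)[s0] -> (s0 * d1 + d0) built below. For example, with
+    // dim(input, i) = 4, repeat_index = 2, and input_index = 1, the element
+    // is stored at output index 9.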
+
+    SmallVector<Value, 4> inputMemRefVal;
+    for (int j = 0; j < inputRank; ++j) {
+      inputMemRefVal.emplace_back(iterationBlock.getArguments()[j * 2]);
+    }
+
+    SmallVector<Value, 4> outputMemRefVal;
+    for (int i = 0; i < inputRank; ++i) {
+      auto inputIndexAE = rewriter.getAffineDimExpr(0);
+      auto repeatsIndexAE = rewriter.getAffineDimExpr(1);
+      auto inputDimAE = rewriter.getAffineSymbolExpr(0);
+
+      auto dimMap =
+          AffineMap::get(2, 1, inputDimAE * repeatsIndexAE + inputIndexAE);
+
+      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
+
+      auto dimExprVal = rewriter.create<AffineApplyOp>(loc, dimMap,
+          ArrayRef<Value>{iterationBlock.getArguments()[2 * i],
+              iterationBlock.getArguments()[2 * i + 1], inputDimSizeVal});
+      outputMemRefVal.emplace_back(dimExprVal);
+    }
+
+    auto inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
+    rewriter.create<StoreOp>(loc, inputVal, alloc, outputMemRefVal);
+
+    rewriter.replaceOp(op, alloc);
+
+    return success();
+  }
+};
+
+void populateLoweringONNXTileOpPattern(
+    OwningRewritePatternList &patterns, MLIRContext *ctx) {
+  patterns.insert<ONNXTileOpLowering>(ctx);
+}
diff --git a/test/backend/test.py b/test/backend/test.py
index 817e788..51a9224 100644
--- a/test/backend/test.py
+++ b/test/backend/test.py
@@ -419,7 +419,11 @@ test_to_enable = [
     "test_split_variable_parts_1d_cpu",
     "test_split_variable_parts_2d_cpu",
     "test_split_variable_parts_default_axis_cpu",
-
+
+    # Tile
+    "test_tile_cpu",
+    "test_tile_precomputed_cpu",
+
     # ConstantOfShape
     "test_constantofshape_float_ones_cpu",
diff --git a/test/mlir/onnx/onnx_lowering.mlir b/test/mlir/onnx/onnx_lowering.mlir
index de3511d..6b437a5 100644
--- a/test/mlir/onnx/onnx_lowering.mlir
+++ b/test/mlir/onnx/onnx_lowering.mlir
@@ -2297,3 +2297,78 @@ func @test_constant_of_shape_static_dims() -> tensor<*xf32> {
   // CHECK: }
   // CHECK: return [[RES]] : memref<3x4x5xf32>
 }
+
+// -----
+
+// Test Tile with 2D input and constant repeats
+func @test_tile1(%arg0 : tensor<4x8xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Constant"() { value = dense<[3, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
+  %1 = "onnx.Tile"(%arg0, %0) : (tensor<4x8xf32>, tensor<2xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile1
+  // CHECK: [[R0:%.+]] = alloc() : memref<12x16xf32>
+  // CHECK: [[R1:%.+]] = "krnl.global"() {name = "constant_0", shape = [2], value = dense<[3, 2]> : tensor<2xi64>} : () -> memref<2xi64>
+  // CHECK: [[R2:%.+]]:2 = krnl.define_loops 2
+  // CHECK: krnl.iterate([[R2]]#0, [[R2]]#1) with ([[R2]]#0 -> [[ARG1:%.+]] = 0 to 12, [[R2]]#1 -> [[ARG2:%.+]] = 0 to 16) {
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R3:%.+]] = dim %arg0, [[C0]] : memref<4x8xf32>
+  // CHECK: [[R4:%.+]] = affine.apply [[INDEX_MAP]]([[ARG1]]){{\[}}[[R3]]{{\]}}
+  // CHECK: [[C1:%.+]] = constant 1 : index
+  // CHECK: [[R5:%.+]] = dim %arg0, [[C1]] : memref<4x8xf32>
+  // CHECK: [[R6:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R5]]{{\]}}
+  // CHECK: [[R7:%.+]] = affine.load %arg0{{\[}}[[R4]], [[R6]]{{\]}} : memref<4x8xf32>
+  // CHECK: affine.store [[R7]], %0{{\[}}[[ARG1]], [[ARG2]]{{\]}} : memref<12x16xf32>
+}
+
+// -----
+
+// Test Tile with 1D input and unknown repeats
+func @test_tile2(%arg0 : tensor<8xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
+  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<8xf32>, tensor<1xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile2
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
+  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
+  // CHECK: [[C0_0:%.+]] = constant 0 : index
+  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<8xf32>
+  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
+  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
+  // CHECK: [[R5:%.+]] = krnl.define_loops 1
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
+  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
+  // CHECK: [[C0_2:%.+]] = constant 0 : index
+  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<8xf32>
+  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
+  // CHECK: [[R9:%.+]] = affine.load %arg0{{\[}}[[R8]]{{\]}} : memref<8xf32>
+  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
+}
+
+// -----
+
+// Test Tile with 1D input of unknown dimension
+func @test_tile3(%arg0 : tensor<?xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
+  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<?xf32>, tensor<1xi64>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+  // CHECK-LABEL: test_tile3
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
+  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
+  // CHECK: [[C0_0:%.+]] = constant 0 : index
+  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<?xf32>
+  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
+  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
+  // CHECK: [[R5:%.+]] = krnl.define_loops 1
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[R6:%.+]] = dim %4, [[C0_1]] : memref<?xf32>
+  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
+  // CHECK: [[C0_2:%.+]] = constant 0 : index
+  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<?xf32>
+  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
+  // CHECK: [[R9:%.+]] = load %arg0{{\[}}[[R8]]{{\]}} : memref<?xf32>
+  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
+}