Lower tile to Krnl (#308)

* alloc for unknown shape

* determine affine

* format

* test for unknown input

* Update test.py

* fix the expression

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test lit

Signed-off-by: chentong <chentong@us.ibm.com>

* remove affine load

Signed-off-by: chentong <chentong@us.ibm.com>

* format

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test

Signed-off-by: chentong <chentong@us.ibm.com>

* fix Affineload

Signed-off-by: chentong <chentong@us.ibm.com>

* affine for alternative

Signed-off-by: chentong <chentong@us.ibm.com>

* use DimOp

Signed-off-by: chentong <chentong@us.ibm.com>

* change test case

Signed-off-by: chentong <chentong@us.ibm.com>

* fix test

Signed-off-by: chentong <chentong@us.ibm.com>

* use more auto type

Signed-off-by: chentong <chentong@us.ibm.com>

* fix affine load

Signed-off-by: chentong <chentong@us.ibm.com>

* small fix

Signed-off-by: chentong <chentong@us.ibm.com>

Co-authored-by: Tian Jin <tjingrant@gmail.com>
chentong319 2020-10-05 00:50:59 -04:00, committed by GitHub
parent cb3d1e4f64
commit 931127c7e9
6 changed files with 313 additions and 1 deletion

src/Conversion/ONNXToKrnl/CMakeLists.txt

@@ -25,6 +25,7 @@ add_library(OMONNXToKrnl
    Tensor/Split.cpp
    Tensor/Gather.cpp
    Tensor/Size.cpp
    Tensor/Tile.cpp
    ConvertONNXToKrnl.cpp)
target_link_libraries(OMONNXToKrnl
    onnx)

src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp

@@ -104,6 +104,7 @@ void FrontendToKrnlLoweringPass::runOnOperation() {
  populateLoweringONNXSqueezeOpPattern(patterns, &getContext());
  populateLoweringONNXSplitOpPattern(patterns, &getContext());
  populateLoweringONNXSizeOpPattern(patterns, &getContext());
  populateLoweringONNXTileOpPattern(patterns, &getContext());
  // Neural network
  populateLoweringONNXConvOpPattern(patterns, &getContext());
  populateLoweringONNXNormalizationOpPattern(patterns, &getContext());

src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp

@@ -256,6 +256,9 @@ void populateLoweringONNXSplitOpPattern(
void populateLoweringONNXSizeOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);
void populateLoweringONNXTileOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);
bool checkOpResultIsUsedByGetRef(AllocOp *allocOp);
int64_t getMemRefSizeInBytes(Value val);

src/Conversion/ONNXToKrnl/Tensor/Tile.cpp (new file)

@@ -0,0 +1,228 @@
//===----------------Tile.cpp - Lowering Tile Op----------------------===//
//
// Copyright 2020 The IBM Research Authors.
//
// =============================================================================
//
// This file lowers the ONNX Tile Operator to Krnl dialect.
//
//===----------------------------------------------------------------------===//
#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"

using namespace mlir;

//===----------------------------------------------------------------------===//
// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
//
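// The size of each output dimension is dim(input, i) * repeats[i], where
// repeats[i] is loaded from the repeats operand at runtime.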
Value insertAllocAndDeallocForTile(MemRefType memRefType, Location loc,
ConversionPatternRewriter &rewriter, bool insertDealloc, Value inputOperand,
Value repeatsOperand) {
AllocOp alloc;
auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
int64_t inputRank = inputShape.size();
SmallVector<Value, 4> allocOperands;
for (int i = 0; i < inputRank; ++i) {
auto indexVal = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
auto repeatsLoadVal =
rewriter.create<AffineLoadOp>(loc, repeatsOperand, repeatsMemRefVal);
auto repeatsElementVal = rewriter.create<IndexCastOp>(
loc, repeatsLoadVal, rewriter.getIndexType());
auto dimVal = rewriter.create<DimOp>(loc, inputOperand, i);
Value allocDimVal = rewriter.create<MulIOp>(loc, dimVal, repeatsElementVal);
allocOperands.emplace_back(allocDimVal);
}
alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
if (insertDealloc) {
auto *parentBlock = alloc.getOperation()->getBlock();
auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
dealloc.getOperation()->moveBefore(&parentBlock->back());
}
return alloc;
}

struct ONNXTileOpLowering : public ConversionPattern {
ONNXTileOpLowering(MLIRContext *ctx)
: ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const final {
ONNXTileOpAdaptor operandAdaptor(operands);
ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
auto loc = op->getLoc();
// get input operands, shapes, and rank
Value input = operandAdaptor.input();
auto inputMemRefType = input.getType().cast<MemRefType>();
auto inputShape = inputMemRefType.getShape();
int64_t inputRank = inputShape.size();
Value repeats = operandAdaptor.repeats();
// get output info
auto resultOperand = tileOp.output();
auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
auto outputMemRefShape = outputMemRefType.getShape();
int64_t outputRank = outputMemRefShape.size();
bool insertDealloc = checkInsertDealloc(op);
Value alloc;
if (hasAllConstantDimensions(outputMemRefType))
alloc =
insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
else
alloc = insertAllocAndDeallocForTile(
outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
// Define loops and iteration trip counts (equivalent to size of output)
std::vector<Value> originalLoops;
defineLoops(rewriter, loc, originalLoops, outputRank);
KrnlIterateOperandPack pack(rewriter, originalLoops);
for (int ii = 0; ii < outputRank; ++ii)
addDimensionToPack(rewriter, loc, pack, alloc, ii);
// Create the loops
auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
Block &iterationBlock = iterateOp.bodyRegion().front();
// Now perform the insertions into the body of the just generated loops.
// Insert instructions inside the KrnlIterateOp body.
rewriter.setInsertionPointToStart(&iterationBlock);
// Handle the operations.
// This implementation iterates over the output tensor, so the store has a
// simple affine subscript expression.
// An alternative is to iterate over the input tensor and the repeats, so
// that loads of input elements can be reused explicitly; however, the store
// subscript is then not contiguous, and may not even be affine.
// That alternative implementation can be found at the end of this file.
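// For example, tiling a dimension of size n = 4 with repeats r = 3 gives an
// output extent of 12, and output index d maps to input index d mod 4:
//   d:       0 1 2 3 4 5 6 7 8 9 10 11
//   d mod 4: 0 1 2 3 0 1 2 3 0 1  2  3
// The affine map (d0)[s0] -> (d0 mod s0) built below encodes exactly this.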
SmallVector<Value, 4> inputMemRefVal;
for (int i = 0; i < outputRank; ++i) {
auto indexAE = rewriter.getAffineDimExpr(0);
auto offsetAE = rewriter.getAffineSymbolExpr(0);
auto dimMap = AffineMap::get(1, 1, indexAE % offsetAE);
auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
auto loopVarVal = iterationBlock.getArguments()[i];
auto exprVal = rewriter.create<AffineApplyOp>(
loc, dimMap, ArrayRef<Value>{loopVarVal, inputDimSizeVal});
inputMemRefVal.emplace_back(exprVal);
}
// Load the value from the input.
// An affine load is used when the input has a constant shape.
Value inputVal;
if (hasAllConstantDimensions(inputMemRefType))
inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
else
inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
SmallVector<Value, 4> outputMemRefVal(iterationBlock.getArguments().begin(),
iterationBlock.getArguments().end());
// Then store the value in the output.
rewriter.create<AffineStoreOp>(loc, inputVal, alloc, outputMemRefVal);
rewriter.replaceOp(op, alloc);
return success();
}
};

// This is an alternative way of lowering, kept here for the record in case
// this implementation is needed later.
struct ONNXTileOpLoweringAlternative : public ConversionPattern {
ONNXTileOpLoweringAlternative(MLIRContext *ctx)
: ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}
LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const final {
ONNXTileOpAdaptor operandAdaptor(operands);
ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
auto loc = op->getLoc();
// get input operands, shapes, and rank
Value input = operandAdaptor.input();
auto inputShape = input.getType().cast<MemRefType>().getShape();
int64_t inputRank = inputShape.size();
Value repeats = operandAdaptor.repeats();
// get output info
auto resultOperand = tileOp.output();
auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
auto outputMemRefShape = outputMemRefType.getShape();
int64_t outputRank = outputMemRefShape.size();
bool insertDealloc = checkInsertDealloc(op);
Value alloc;
if (hasAllConstantDimensions(outputMemRefType))
alloc =
insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
else
alloc = insertAllocAndDeallocForTile(
outputMemRefType, loc, rewriter, insertDealloc, input, repeats);
// Define loops and iteration trip counts (equivalent to size of output)
std::vector<Value> originalLoops;
defineLoops(rewriter, loc, originalLoops, outputRank * 2);
KrnlIterateOperandPack pack(rewriter, originalLoops);
for (int ii = 0; ii < outputRank; ++ii) {
addDimensionToPack(rewriter, loc, pack, input, ii);
pack.pushConstantBound(0);
auto indexVal =
emitConstantOp(rewriter, loc, rewriter.getIndexType(), ii);
SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
auto repeatsLoadVal =
rewriter.create<AffineLoadOp>(loc, repeats, repeatsMemRefVal);
auto repeatsElementVal = rewriter.create<IndexCastOp>(
loc, repeatsLoadVal, rewriter.getIndexType());
pack.pushOperandBound(repeatsElementVal);
}
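// The resulting nest interleaves the two index kinds per dimension: loop
// 2*i iterates over the extent of input dimension i and loop 2*i+1 over
// repeats[i], giving outputRank * 2 loops in total.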
// Create the loops
auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
Block &iterationBlock = iterateOp.bodyRegion().front();
// Now perform the insertions into the body of the just generated loops.
// Insert instructions inside the KrnlIterateOp body.
rewriter.setInsertionPointToStart(&iterationBlock);
// Handle the operations.
SmallVector<Value, 4> inputMemRefVal;
for (int j = 0; j < inputRank; ++j) {
inputMemRefVal.emplace_back(iterationBlock.getArguments()[j * 2]);
}
SmallVector<Value, 4> outputMemRefVal;
for (int i = 0; i < inputRank; ++i) {
auto inputIndexAE = rewriter.getAffineDimExpr(0);
auto repeatsIndexAE = rewriter.getAffineDimExpr(1);
auto inputDimAE = rewriter.getAffineSymbolExpr(0);
auto dimMap =
AffineMap::get(2, 1, inputDimAE * repeatsIndexAE + inputIndexAE);
auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
auto dimExprVal = rewriter.create<AffineApplyOp>(loc, dimMap,
ArrayRef<Value>{iterationBlock.getArguments()[2 * i],
iterationBlock.getArguments()[2 * i + 1], inputDimSizeVal});
outputMemRefVal.emplace_back(dimExprVal);
}
auto inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
rewriter.create<StoreOp>(loc, inputVal, alloc, outputMemRefVal);
rewriter.replaceOp(op, alloc);
return success();
}
};

void populateLoweringONNXTileOpPattern(
OwningRewritePatternList &patterns, MLIRContext *ctx) {
patterns.insert<ONNXTileOpLowering>(ctx);
}
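For reference, the index arithmetic behind the two lowerings can be checked against a minimal standalone C++ sketch of the 1-D case (illustrative only, not part of the commit; tile1D and tile1DAlt are made-up names), with input size n and repeat count r:

#include <cstdint>
#include <vector>

// Main lowering: iterate the output; the load subscript is i mod n,
// matching the affine map (d0)[s0] -> (d0 mod s0).
std::vector<float> tile1D(const std::vector<float> &in, int64_t r) {
  const int64_t n = in.size();
  std::vector<float> out(n * r);
  for (int64_t i = 0; i < n * r; ++i)
    out[i] = in[i % n];
  return out;
}

// Alternative lowering: iterate the input and the repeats; each input
// element is loaded once and stored r times at subscript n * q + i,
// matching the affine map (d0, d1)[s0] -> (s0 * d1 + d0).
std::vector<float> tile1DAlt(const std::vector<float> &in, int64_t r) {
  const int64_t n = in.size();
  std::vector<float> out(n * r);
  for (int64_t i = 0; i < n; ++i)
    for (int64_t q = 0; q < r; ++q)
      out[q * n + i] = in[i];
  return out;
}

Both compute the same result; the commit registers only the first pattern, since its load and store subscripts stay affine for tensors of any rank.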

test/backend/test.py

@@ -420,6 +420,10 @@ test_to_enable = [
    "test_split_variable_parts_2d_cpu",
    "test_split_variable_parts_default_axis_cpu",
    # Tile
    "test_tile_cpu",
    "test_tile_precomputed_cpu",
    # ConstantOfShape
    "test_constantofshape_float_ones_cpu",

test/mlir/onnx/onnx_lowering.mlir

@@ -2297,3 +2297,78 @@ func @test_constant_of_shape_static_dims() -> tensor<*xf32> {
  // CHECK: }
  // CHECK: return [[RES]] : memref<3x4x5xf32>
}
// -----
// Test Tile with 2D input and constant repeats
func @test_tile1(%arg0 : tensor<4x8xf32>) -> tensor<*xf32> {
%0 = "onnx.Constant"() { value = dense<[3, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
%1 = "onnx.Tile"(%arg0, %0) : (tensor<4x8xf32>, tensor<2xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile1
// CHECK: [[R0:%.+]] = alloc() : memref<12x16xf32>
// CHECK: [[R1:%.+]] = "krnl.global"() {name = "constant_0", shape = [2], value = dense<[3, 2]> : tensor<2xi64>} : () -> memref<2xi64>
// CHECK: [[R2:%.+]]:2 = krnl.define_loops 2
// CHECK: krnl.iterate([[R2]]#0, [[R2]]#1) with ([[R2]]#0 -> [[ARG1:%.+]] = 0 to 12, [[R2]]#1 -> [[ARG2:%.+]] = 0 to 16) {
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R3:%.+]] = dim %arg0, [[C0]] : memref<4x8xf32>
// CHECK: [[R4:%.+]] = affine.apply [[INDEX_MAP]]([[ARG1]]){{\[}}[[R3]]{{\]}}
// CHECK: [[C1:%.+]] = constant 1 : index
// CHECK: [[R5:%.+]] = dim %arg0, [[C1]] : memref<4x8xf32>
// CHECK: [[R6:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R5]]{{\]}}
// CHECK: [[R7:%.+]] = affine.load %arg0{{\[}}[[R4]], [[R6]]{{\]}} : memref<4x8xf32>
// CHECK: affine.store [[R7]], [[R0]]{{\[}}[[ARG1]], [[ARG2]]{{\]}} : memref<12x16xf32>
}
// -----
// Test Tile with 1D input and unknown repeats
func @test_tile2(%arg0 : tensor<8xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
%1 = "onnx.Tile"(%arg0, %arg1) : (tensor<8xf32>, tensor<1xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile2
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
// CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
// CHECK: [[C0_0:%.+]] = constant 0 : index
// CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<8xf32>
// CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
// CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
// CHECK: [[R5:%.+]] = krnl.define_loops 1
// CHECK: [[C0_1:%.+]] = constant 0 : index
// CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
// CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
// CHECK: [[C0_2:%.+]] = constant 0 : index
// CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<8xf32>
// CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
// CHECK: [[R9:%.+]] = affine.load %arg0{{\[}}[[R8]]{{\]}} : memref<8xf32>
// CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}
// -----
// Test Tile with 1D unknown input
func @test_tile3(%arg0 : tensor<?xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
%1 = "onnx.Tile"(%arg0, %arg1) : (tensor<?xf32>, tensor<1xi64>) -> tensor<*xf32>
return %1 : tensor<*xf32>
// CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
// CHECK-LABEL: test_tile3
// CHECK: [[C0:%.+]] = constant 0 : index
// CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
// CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
// CHECK: [[C0_0:%.+]] = constant 0 : index
// CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<?xf32>
// CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
// CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
// CHECK: [[R5:%.+]] = krnl.define_loops 1
// CHECK: [[C0_1:%.+]] = constant 0 : index
// CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
// CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
// CHECK: [[C0_2:%.+]] = constant 0 : index
// CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<?xf32>
// CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
// CHECK: [[R9:%.+]] = load %arg0{{\[}}[[R8]]{{\]}} : memref<?xf32>
// CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}