Lower Tile to Krnl (#308)
* alloc for unknown shape
* determine affine
* format
* test for unknown input
* Update test.py
* fix the expression
* fix test lit
* remove affine load
* format
* fix test
* fix AffineLoad
* affine for alternative
* use DimOp
* change test case
* fix test
* use more auto type
* fix affine load
* small fix

Signed-off-by: chentong <chentong@us.ibm.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
parent cb3d1e4f64
commit 931127c7e9
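For context on the lowering below: ONNX Tile repeats the input along every axis according to a `repeats` tensor, so each output dimension is the corresponding input dimension times its repeat count, and every output index maps back into the input with a modulo. That is exactly the arithmetic the new Tile.cpp emits: `DimOp`/`MulIOp` for the allocation sizes and the affine map `(d0)[s0] -> (d0 mod s0)` for the load subscript. The following is a minimal standalone C++ sketch of that index math, not code from this patch; the helper name `tileRef` and the row-major layout are assumptions for illustration.

```cpp
// Minimal, standalone model of the Tile index arithmetic used by this
// lowering (illustrative only; tileRef is a hypothetical helper, not
// part of the patch).
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Tile a row-major tensor `input` of shape `inShape` by `repeats`.
// Output dim i is inShape[i] * repeats[i]; an output index maps to the
// input via (outIdx[i] mod inShape[i]), matching the per-dimension
// affine map (d0)[s0] -> (d0 mod s0).
std::vector<float> tileRef(const std::vector<float> &input,
                           const std::vector<int64_t> &inShape,
                           const std::vector<int64_t> &repeats) {
  assert(inShape.size() == repeats.size());
  std::vector<int64_t> outShape(inShape.size());
  int64_t outSize = 1;
  for (size_t i = 0; i < inShape.size(); ++i) {
    outShape[i] = inShape[i] * repeats[i]; // same as DimOp * repeats load
    outSize *= outShape[i];
  }
  std::vector<float> output(outSize);
  // Iterate over the *output* tensor, as the main lowering pattern does.
  for (int64_t flat = 0; flat < outSize; ++flat) {
    int64_t rem = flat, inFlat = 0, inStride = 1;
    for (int64_t i = static_cast<int64_t>(inShape.size()) - 1; i >= 0; --i) {
      int64_t outIdx = rem % outShape[i];
      rem /= outShape[i];
      int64_t inIdx = outIdx % inShape[i]; // d0 mod s0
      inFlat += inIdx * inStride;
      inStride *= inShape[i];
    }
    output[flat] = input[inFlat];
  }
  return output;
}

int main() {
  // Tile a 1x2 tensor [1, 2] with repeats [2, 2] -> 2x4 tensor.
  auto out = tileRef({1, 2}, {1, 2}, {2, 2});
  for (float v : out)
    std::cout << v << ' '; // prints: 1 2 1 2 1 2 1 2
  std::cout << '\n';
}
```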
@@ -25,6 +25,7 @@ add_library(OMONNXToKrnl
        Tensor/Split.cpp
        Tensor/Gather.cpp
        Tensor/Size.cpp
        Tensor/Tile.cpp
        ConvertONNXToKrnl.cpp)
target_link_libraries(OMONNXToKrnl
        onnx)
@@ -104,6 +104,7 @@ void FrontendToKrnlLoweringPass::runOnOperation() {
  populateLoweringONNXSqueezeOpPattern(patterns, &getContext());
  populateLoweringONNXSplitOpPattern(patterns, &getContext());
  populateLoweringONNXSizeOpPattern(patterns, &getContext());
  populateLoweringONNXTileOpPattern(patterns, &getContext());
  // Neural network
  populateLoweringONNXConvOpPattern(patterns, &getContext());
  populateLoweringONNXNormalizationOpPattern(patterns, &getContext());
@@ -256,6 +256,9 @@ void populateLoweringONNXSplitOpPattern(
void populateLoweringONNXSizeOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);

void populateLoweringONNXTileOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx);

bool checkOpResultIsUsedByGetRef(AllocOp *allocOp);

int64_t getMemRefSizeInBytes(Value val);
@@ -0,0 +1,228 @@
//===----------------- Tile.cpp - Lowering Tile Op -----------------------===//
//
// Copyright 2020 The IBM Research Authors.
//
// =============================================================================
//
// This file lowers the ONNX Tile Operator to Krnl dialect.
//
//===----------------------------------------------------------------------===//

#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"

using namespace mlir;

//===----------------------------------------------------------------------===//
// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
//

Value insertAllocAndDeallocForTile(MemRefType memRefType, Location loc,
    ConversionPatternRewriter &rewriter, bool insertDealloc, Value inputOperand,
    Value repeatsOperand) {
  AllocOp alloc;
  auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
  auto inputRank = inputShape.size();

  SmallVector<Value, 4> allocOperands;
  for (int i = 0; i < inputRank; ++i) {
    auto indexVal = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
    SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
    auto repeatsLoadVal =
        rewriter.create<AffineLoadOp>(loc, repeatsOperand, repeatsMemRefVal);
    auto repeatsElementVal = rewriter.create<IndexCastOp>(
        loc, repeatsLoadVal, rewriter.getIndexType());
    auto dimVal = rewriter.create<DimOp>(loc, inputOperand, i);
    Value allocDimVal = rewriter.create<MulIOp>(loc, dimVal, repeatsElementVal);
    allocOperands.emplace_back(allocDimVal);
  }
  alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
  if (insertDealloc) {
    auto *parentBlock = alloc.getOperation()->getBlock();
    auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
    dealloc.getOperation()->moveBefore(&parentBlock->back());
  }
  return alloc;
}

struct ONNXTileOpLowering : public ConversionPattern {
  ONNXTileOpLowering(MLIRContext *ctx)
      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}

  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
      ConversionPatternRewriter &rewriter) const final {
    ONNXTileOpAdaptor operandAdaptor(operands);
    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
    auto loc = op->getLoc();

    // Get input operands, shapes, and rank.
    Value input = operandAdaptor.input();
    auto inputMemRefType = input.getType().cast<MemRefType>();
    auto inputShape = inputMemRefType.getShape();
    int64_t inputRank = inputShape.size();
    Value repeats = operandAdaptor.repeats();

    // Get output info.
    auto resultOperand = tileOp.output();
    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
    auto outputMemRefShape = outputMemRefType.getShape();
    int64_t outputRank = outputMemRefShape.size();

    bool insertDealloc = checkInsertDealloc(op);
    Value alloc;
    if (hasAllConstantDimensions(outputMemRefType))
      alloc =
          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
    else
      alloc = insertAllocAndDeallocForTile(
          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);

    // Define loops and iteration trip counts (equivalent to size of output).
    std::vector<Value> originalLoops;
    defineLoops(rewriter, loc, originalLoops, outputRank);
    KrnlIterateOperandPack pack(rewriter, originalLoops);
    for (int ii = 0; ii < outputRank; ++ii)
      addDimensionToPack(rewriter, loc, pack, alloc, ii);

    // Create the loops.
    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
    Block &iterationBlock = iterateOp.bodyRegion().front();

    // Now perform the insertions into the body of the just generated loops.
    // Insert instructions inside the KrnlIterateOp body.
    rewriter.setInsertionPointToStart(&iterationBlock);

    // Handle the operations.

    // This implementation iterates over the output tensor, so the store uses
    // a simple affine subscript expression. An alternative implementation is
    // to iterate over the input tensor and the repeats; the loaded input
    // elements could then be reused explicitly, but the store subscripts are
    // not contiguous and may not even be affine. That alternative
    // implementation can be found at the end of this file.
    SmallVector<Value, 4> inputMemRefVal;
    for (int i = 0; i < outputRank; ++i) {
      auto indexAE = rewriter.getAffineDimExpr(0);
      auto offsetAE = rewriter.getAffineSymbolExpr(0);
      auto dimMap = AffineMap::get(1, 1, indexAE % offsetAE);

      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);
      auto loopVarVal = iterationBlock.getArguments()[i];
      auto exprVal = rewriter.create<AffineApplyOp>(
          loc, dimMap, ArrayRef<Value>{loopVarVal, inputDimSizeVal});
      inputMemRefVal.emplace_back(exprVal);
    }

    // Load the value from the input. An affine load is used when the input
    // has a constant shape; otherwise a standard load is emitted.
    Value inputVal;
    if (hasAllConstantDimensions(inputMemRefType))
      inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
    else
      inputVal = rewriter.create<LoadOp>(loc, input, inputMemRefVal);
    SmallVector<Value, 4> outputMemRefVal(iterationBlock.getArguments().begin(),
        iterationBlock.getArguments().end());

    // Then store the value in the output.
    rewriter.create<AffineStoreOp>(loc, inputVal, alloc, outputMemRefVal);

    rewriter.replaceOp(op, alloc);

    return success();
  }
};

// This is an alternative way of lowering Tile. It is kept here for reference
// in case this implementation is needed later.
struct ONNXTileOpLoweringAlternative : public ConversionPattern {
  ONNXTileOpLoweringAlternative(MLIRContext *ctx)
      : ConversionPattern(mlir::ONNXTileOp::getOperationName(), 1, ctx) {}

  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
      ConversionPatternRewriter &rewriter) const final {
    ONNXTileOpAdaptor operandAdaptor(operands);
    ONNXTileOp tileOp = llvm::cast<ONNXTileOp>(op);
    auto loc = op->getLoc();
    // Get input operands, shapes, and rank.
    Value input = operandAdaptor.input();
    auto inputShape = input.getType().cast<MemRefType>().getShape();
    int64_t inputRank = inputShape.size();
    Value repeats = operandAdaptor.repeats();

    // Get output info.
    auto resultOperand = tileOp.output();
    auto outputMemRefType = convertToMemRefType(*op->result_type_begin());
    auto outputMemRefShape = outputMemRefType.getShape();
    int64_t outputRank = outputMemRefShape.size();

    bool insertDealloc = checkInsertDealloc(op);
    Value alloc;
    if (hasAllConstantDimensions(outputMemRefType))
      alloc =
          insertAllocAndDealloc(outputMemRefType, loc, rewriter, insertDealloc);
    else
      alloc = insertAllocAndDeallocForTile(
          outputMemRefType, loc, rewriter, insertDealloc, input, repeats);

    // Define loops and iteration trip counts (equivalent to size of output).
    std::vector<Value> originalLoops;
    defineLoops(rewriter, loc, originalLoops, outputRank * 2);
    KrnlIterateOperandPack pack(rewriter, originalLoops);
    for (int ii = 0; ii < outputRank; ++ii) {
      addDimensionToPack(rewriter, loc, pack, input, ii);
      pack.pushConstantBound(0);
      auto indexVal =
          emitConstantOp(rewriter, loc, rewriter.getIndexType(), ii);
      SmallVector<Value, 1> repeatsMemRefVal = {indexVal};
      auto repeatsLoadVal =
          rewriter.create<AffineLoadOp>(loc, repeats, repeatsMemRefVal);
      auto repeatsElementVal = rewriter.create<IndexCastOp>(
          loc, repeatsLoadVal, rewriter.getIndexType());
      pack.pushOperandBound(repeatsElementVal);
    }

    // Create the loops.
    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
    Block &iterationBlock = iterateOp.bodyRegion().front();

    // Now perform the insertions into the body of the just generated loops.
    // Insert instructions inside the KrnlIterateOp body.
    rewriter.setInsertionPointToStart(&iterationBlock);

    // Handle the operations.

    SmallVector<Value, 4> inputMemRefVal;
    for (int j = 0; j < inputRank; ++j) {
      inputMemRefVal.emplace_back(iterationBlock.getArguments()[j * 2]);
    }

    SmallVector<Value, 4> outputMemRefVal;
    for (int i = 0; i < inputRank; ++i) {
      auto inputIndexAE = rewriter.getAffineDimExpr(0);
      auto repeatsIndexAE = rewriter.getAffineDimExpr(1);
      auto inputDimAE = rewriter.getAffineSymbolExpr(0);

      auto dimMap =
          AffineMap::get(2, 1, inputDimAE * repeatsIndexAE + inputIndexAE);

      auto inputDimSizeVal = rewriter.create<DimOp>(loc, input, i);

      auto dimExprVal = rewriter.create<AffineApplyOp>(loc, dimMap,
          ArrayRef<Value>{iterationBlock.getArguments()[2 * i],
              iterationBlock.getArguments()[2 * i + 1], inputDimSizeVal});
      outputMemRefVal.emplace_back(dimExprVal);
    }

    auto inputVal = rewriter.create<AffineLoadOp>(loc, input, inputMemRefVal);
    rewriter.create<StoreOp>(loc, inputVal, alloc, outputMemRefVal);

    rewriter.replaceOp(op, alloc);

    return success();
  }
};

void populateLoweringONNXTileOpPattern(
    OwningRewritePatternList &patterns, MLIRContext *ctx) {
  patterns.insert<ONNXTileOpLowering>(ctx);
}
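The `ONNXTileOpLoweringAlternative` pattern kept above iterates the input together with a per-dimension repeat counter and scatters into the output through the affine map `s0 * d1 + d0`, i.e. `inputDim * repeatIdx + inputIdx`. Below is a small hedged C++ sketch of that traversal for the 1-D case, again illustrative rather than code from the patch (the variable names are invented):

```cpp
// Sketch of the alternative traversal kept for reference above: iterate
// (inputIndex, repeatIndex) pairs per dimension and compute the output
// subscript as inputDim * repeatIdx + inputIdx.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // 1-D example: input [10, 20, 30], repeats = 2.
  std::vector<int64_t> input = {10, 20, 30};
  int64_t inDim = static_cast<int64_t>(input.size());
  int64_t repeats = 2;

  std::vector<int64_t> output(inDim * repeats);
  for (int64_t inIdx = 0; inIdx < inDim; ++inIdx) {   // input loop
    int64_t v = input[inIdx];                         // load once, reuse
    for (int64_t rep = 0; rep < repeats; ++rep) {     // repeats loop
      int64_t outIdx = inDim * rep + inIdx;           // value of the affine map
      output[outIdx] = v;                             // non-contiguous store
    }
  }
  for (int64_t v : output)
    std::cout << v << ' '; // prints: 10 20 30 10 20 30
  std::cout << '\n';
}
```

This makes the trade-off described in the comments concrete: each input element is loaded once and reused across repeats, but the stores land in non-contiguous slices of the output, which is why the main pattern iterates the output instead.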
@@ -420,6 +420,10 @@ test_to_enable = [
    "test_split_variable_parts_2d_cpu",
    "test_split_variable_parts_default_axis_cpu",

    # Tile
    "test_tile_cpu",
    "test_tile_precomputed_cpu",

    # ConstantOfShape
    "test_constantofshape_float_ones_cpu",

@@ -2297,3 +2297,78 @@ func @test_constant_of_shape_static_dims() -> tensor<*xf32> {
  // CHECK: }
  // CHECK: return [[RES]] : memref<3x4x5xf32>
}

// -----

// Test Tile with 2D input and constant repeats.
func @test_tile1(%arg0 : tensor<4x8xf32>) -> tensor<*xf32> {
  %0 = "onnx.Constant"() { value = dense<[3, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
  %1 = "onnx.Tile"(%arg0, %0) : (tensor<4x8xf32>, tensor<2xi64>) -> tensor<*xf32>
  return %1 : tensor<*xf32>
  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
  // CHECK-LABEL: test_tile1
  // CHECK: [[R0:%.+]] = alloc() : memref<12x16xf32>
  // CHECK: [[R1:%.+]] = "krnl.global"() {name = "constant_0", shape = [2], value = dense<[3, 2]> : tensor<2xi64>} : () -> memref<2xi64>
  // CHECK: [[R2:%.+]]:2 = krnl.define_loops 2
  // CHECK: krnl.iterate([[R2]]#0, [[R2]]#1) with ([[R2]]#0 -> [[ARG1:%.+]] = 0 to 12, [[R2]]#1 -> [[ARG2:%.+]] = 0 to 16) {
  // CHECK: [[C0:%.+]] = constant 0 : index
  // CHECK: [[R3:%.+]] = dim %arg0, [[C0]] : memref<4x8xf32>
  // CHECK: [[R4:%.+]] = affine.apply [[INDEX_MAP]]([[ARG1]]){{\[}}[[R3]]{{\]}}
  // CHECK: [[C1:%.+]] = constant 1 : index
  // CHECK: [[R5:%.+]] = dim %arg0, [[C1]] : memref<4x8xf32>
  // CHECK: [[R6:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R5]]{{\]}}
  // CHECK: [[R7:%.+]] = affine.load %arg0{{\[}}[[R4]], [[R6]]{{\]}} : memref<4x8xf32>
  // CHECK: affine.store [[R7]], [[R0]]{{\[}}[[ARG1]], [[ARG2]]{{\]}} : memref<12x16xf32>
}

// -----

// Test Tile with 1D input and unknown repeats.
func @test_tile2(%arg0 : tensor<8xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<8xf32>, tensor<1xi64>) -> tensor<*xf32>
  return %1 : tensor<*xf32>
  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
  // CHECK-LABEL: test_tile2
  // CHECK: [[C0:%.+]] = constant 0 : index
  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
  // CHECK: [[C0_0:%.+]] = constant 0 : index
  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<8xf32>
  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
  // CHECK: [[R5:%.+]] = krnl.define_loops 1
  // CHECK: [[C0_1:%.+]] = constant 0 : index
  // CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
  // CHECK: [[C0_2:%.+]] = constant 0 : index
  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<8xf32>
  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
  // CHECK: [[R9:%.+]] = affine.load %arg0{{\[}}[[R8]]{{\]}} : memref<8xf32>
  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}

// -----

// Test Tile with 1D input of unknown shape.
func @test_tile3(%arg0 : tensor<?xf32>, %arg1 : tensor<1xi64>) -> tensor<*xf32> {
  %1 = "onnx.Tile"(%arg0, %arg1) : (tensor<?xf32>, tensor<1xi64>) -> tensor<*xf32>
  return %1 : tensor<*xf32>
  // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
  // CHECK-LABEL: test_tile3
  // CHECK: [[C0:%.+]] = constant 0 : index
  // CHECK: [[R0:%.+]] = affine.load %arg1{{\[}}[[C0]]{{\]}} : memref<1xi64>
  // CHECK: [[R1:%.+]] = index_cast [[R0]] : i64 to index
  // CHECK: [[C0_0:%.+]] = constant 0 : index
  // CHECK: [[R2:%.+]] = dim %arg0, [[C0_0]] : memref<?xf32>
  // CHECK: [[R3:%.+]] = muli [[R2]], [[R1]] : index
  // CHECK: [[R4:%.+]] = alloc([[R3]]) : memref<?xf32>
  // CHECK: [[R5:%.+]] = krnl.define_loops 1
  // CHECK: [[C0_1:%.+]] = constant 0 : index
  // CHECK: [[R6:%.+]] = dim [[R4]], [[C0_1]] : memref<?xf32>
  // CHECK: krnl.iterate([[R5]]) with ([[R5]] -> [[ARG2:%.+]] = 0 to [[R6]]) {
  // CHECK: [[C0_2:%.+]] = constant 0 : index
  // CHECK: [[R7:%.+]] = dim %arg0, [[C0_2]] : memref<?xf32>
  // CHECK: [[R8:%.+]] = affine.apply [[INDEX_MAP]]([[ARG2]]){{\[}}[[R7]]{{\]}}
  // CHECK: [[R9:%.+]] = load %arg0{{\[}}[[R8]]{{\]}} : memref<?xf32>
  // CHECK: affine.store [[R9]], [[R4]]{{\[}}[[ARG2]]{{\]}} : memref<?xf32>
}
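As a quick sanity check on `test_tile1` above: a `4x8` input tiled with repeats `[3, 2]` gives `4*3 x 8*2 = 12x16`, which is exactly the `memref<12x16xf32>` the CHECK lines expect, while `test_tile2` and `test_tile3` fall back to a dynamic `memref<?xf32>` because one factor is unknown at compile time. A trivial hedged sketch of that shape computation, illustrative only:

```cpp
// Shape check for test_tile1: out[i] = in[i] * repeats[i].
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> inShape = {4, 8};
  std::vector<int64_t> repeats = {3, 2};
  for (size_t i = 0; i < inShape.size(); ++i)
    std::cout << inShape[i] * repeats[i]
              << (i + 1 < inShape.size() ? "x" : "\n"); // prints: 12x16
}
```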