Lower MaxPooling and AveragePool to Krnl dialect using AffineMap (#38)

* Create a template for pooling and add support for AveragePool

* Edit MLIR tests for MaxPool according to the new lowering template for pooling

* Dealloc temporary variables

* Support count_include_pad for AveragePool

* Add MLIR tests for AveragePool lowering

* Make changes according to Tian's comments

* Push AffineMap as upper bound for KrnlIterateOp

* Test AffineMap to use in Pooling

* Replace the old implementation by a new one using AffineMap

* Fix the computation when dilations are non-unit

* Clean up the old code

* Remove AveragePool from Canonicalization pass

* Fix computing the end indices of a filter window

* Refactor the code for pooling

* Revise pushAffineMapBound

* Add MLIR tests

* Remove unused functions

* Fix check-onnx-backend build on x86 Linux. (#91)

* Add the split marker to test files (#90)

Co-authored-by: Tian Jin <tjingrant@gmail.com>

Co-authored-by: gongsu832 <gong_su@hotmail.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
This commit is contained in:
Tung D. Le 2020-04-19 22:39:34 +09:00 committed by GitHub
parent e32f531546
commit eac2297624
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 636 additions and 517 deletions

View File

@ -8,6 +8,7 @@
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
#include "mlir/IR/AffineExpr.h"
#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp" #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
using namespace mlir; using namespace mlir;
@ -19,6 +20,19 @@ Value getIdentityValue<ONNXMaxPoolSingleOutOp>(
return emitNegativeInfinityConstantOp(rewriter, loc, type); return emitNegativeInfinityConstantOp(rewriter, loc, type);
} }
// Identity value for AveragePool: a constant 0 of the requested element type,
// used as the starting value before the window elements are accumulated.
template <>
Value getIdentityValue<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, Type type) {
  Value zeroOfType = emitConstantOp(rewriter, loc, type, 0);
  return zeroOfType;
}
// Scalar operations
// AveragePool combines the elements of a pooling window by addition.
// NOTE(review): IOp/FOp presumably select the integer and floating-point
// variants of the op -- confirm against the ScalarOp primary template.
template <>
struct ScalarOp<ONNXAveragePoolOp> {
  using IOp = AddIOp;
  using FOp = AddFOp;
};
template <> template <>
Value emitScalarOpFor<ONNXMaxPoolSingleOutOp>( Value emitScalarOpFor<ONNXMaxPoolSingleOutOp>(
ConversionPatternRewriter &rewriter, Location loc, Operation *op, ConversionPatternRewriter &rewriter, Location loc, Operation *op,
@ -30,288 +44,519 @@ Value emitScalarOpFor<ONNXMaxPoolSingleOutOp>(
return result; return result;
} }
struct ONNXMaxPoolSingleOutOpLowering : public ConversionPattern { //===----------------------------------------------------------------------===//
ONNXMaxPoolSingleOutOpLowering(MLIRContext *ctx) // Get dilation values
: ConversionPattern( //
mlir::ONNXMaxPoolSingleOutOp::getOperationName(), 1, ctx) {} template <typename PoolOp>
std::vector<int64_t> getDilations(PoolOp poolOp) {
return {};
}
// MaxPool has dilations attribute.
// Reads the `dilations` attribute of the op and returns its values. When no
// value is greater than 1 (i.e. no dimension is actually dilated) an empty
// vector is returned instead, which callers treat as "not dilated".
template <>
std::vector<int64_t> getDilations<ONNXMaxPoolSingleOutOp>(
    ONNXMaxPoolSingleOutOp poolOp) {
  std::vector<int64_t> values;
  bool hasNonUnitDilation = false;
  for (auto attr : poolOp.dilationsAttr().getValue()) {
    int64_t value = attr.cast<IntegerAttr>().getInt();
    if (value > 1)
      hasNonUnitDilation = true;
    values.emplace_back(value);
  }
  // All dilations are the default: report "no dilations" to the caller.
  if (!hasNonUnitDilation)
    return {};
  return values;
}
//===----------------------------------------------------------------------===//
// Get count_include_pad values
//
// Generic fallback: always reports that padded cells are not counted. Ops that
// carry a count_include_pad attribute provide their own specialization.
template <typename PoolOp>
bool getCountIncludePad(PoolOp poolOp) {
  (void)poolOp; // The generic version does not inspect the op.
  return false;
}
// AveragePool has count_include_pad attribute.
// Returns true exactly when that attribute equals 1.
template <>
bool getCountIncludePad<ONNXAveragePoolOp>(ONNXAveragePoolOp poolOp) {
  const bool includePad = (poolOp.count_include_pad() == 1);
  return includePad;
}
//===----------------------------------------------------------------------===//
// Helper function to do post-processing after applying a filter window.
//
// Generic fallback: emits nothing. Ops that need a final step over the whole
// pooling window provide their own specialization.
template <typename PoolOp>
void postProcessPoolingWindow(ConversionPatternRewriter &rewriter, Location loc,
    PoolOp poolOp, Value alloc, ArrayRef<Value> resultIndices,
    ArrayRef<int64_t> kernelShape, ArrayRef<Value> poolDimValues) {
  // Intentionally empty.
}
// Calculate the average value for AveragePool.
//
// Loads the accumulated sum stored at `resultIndices` in `alloc`, divides it by
// the number of elements to average over, and stores the quotient back to the
// same location.
//   - count_include_pad == 1: the divisor is the product of `kernelShape`,
//     emitted as a constant.
//   - otherwise: the divisor is the product of the `poolDimValues` (the
//     per-dimension extents of the pooling window), computed at runtime.
// The division is emitted with DivFOp, so the result element type is expected
// to be a floating-point type.
// NOTE(review): no guard against a zero divisor (empty window) is emitted.
template <>
void postProcessPoolingWindow<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
    Value alloc, ArrayRef<Value> resultIndices, ArrayRef<int64_t> kernelShape,
    ArrayRef<Value> poolDimValues) {
  const bool padsAreCounted = getCountIncludePad<ONNXAveragePoolOp>(poolOp);

  Value sum = rewriter.create<LoadOp>(loc, alloc, resultIndices);
  Type elementType = sum.getType();

  Value count;
  if (!padsAreCounted) {
    // Runtime window size: multiply the extents, then index -> i64 -> float.
    Value windowSize = poolDimValues[0];
    for (Value extent : poolDimValues.drop_front())
      windowSize = rewriter.create<MulIOp>(loc, windowSize, extent);
    Value windowSizeI64 = rewriter.create<IndexCastOp>(
        loc, windowSize, rewriter.getIntegerType(64));
    count = rewriter.create<SIToFPOp>(loc, windowSizeI64, elementType);
  } else {
    // Static window size: the full kernel, padding included.
    int64_t kernelSize = 1;
    for (int64_t extent : kernelShape)
      kernelSize *= extent;
    count = emitConstantOp(rewriter, loc, elementType, kernelSize);
  }

  Value mean = rewriter.create<DivFOp>(loc, sum, count);
  rewriter.create<StoreOp>(loc, mean, alloc, resultIndices);
}
//===----------------------------------------------------------------------===//
// Helper function to insert alloc and dealloc ops for memref of dynamic shape.
//
// Emits an AllocOp of type `memRefType` whose dynamic sizes are computed from
// `inputOperand`:
//   - leading dimensions (those before the trailing kernelShape.size()
//     dimensions) take the size of the same dimension of the input;
//   - the trailing (pooled) dimensions are computed in i64 arithmetic as
//       round((input + padBegin + padEnd - (kernel - 1) * dilation - 1)
//             / stride) + 1
//     where round is ceil when `ceilMode` is set and the plain signed division
//     otherwise. An empty `dilations` means a dilation of 1 everywhere.
// When `insertDealloc` is set, a DeallocOp is also created and moved right
// before the last operation of the block holding the alloc.
// Returns the result of the AllocOp.
Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter,
    Location loc, bool insertDealloc, MemRefType memRefType, Value inputOperand,
    ArrayRef<int64_t> kernelShape, ArrayRef<int64_t> pads,
    ArrayRef<int64_t> strides, ArrayRef<int64_t> dilations, bool ceilMode) {
  // Shape and rank information related to result and kernel.
  auto outShape = memRefType.getShape();
  auto numOutDims = outShape.size();
  auto numSpatialDims = kernelShape.size();
  auto firstSpatialDim = numOutDims - numSpatialDims;
  auto i64Type = rewriter.getIntegerType(64);

  // Sizes for the dynamic dimensions, in dimension order.
  SmallVector<Value, 2> dynamicSizes;

  // Leading dimensions are copied from the input.
  for (int dimIdx = 0; dimIdx < firstSpatialDim; ++dimIdx) {
    if (outShape[dimIdx] >= 0)
      continue;
    auto inputDim = rewriter.create<DimOp>(loc, inputOperand, dimIdx);
    dynamicSizes.emplace_back(inputDim);
  }

  // Shared i64 constants; `zero` is only needed for the ceil-mode rounding.
  Value zero, one;
  if (ceilMode)
    zero =
        rewriter.create<ConstantOp>(loc, rewriter.getIntegerAttr(i64Type, 0));
  one = rewriter.create<ConstantOp>(loc, rewriter.getIntegerAttr(i64Type, 1));

  // Pooled dimensions.
  for (int dimIdx = firstSpatialDim; dimIdx < outShape.size(); ++dimIdx) {
    if (outShape[dimIdx] >= 0)
      continue;

    int spatialIdx = dimIdx - firstSpatialDim;

    // Statically known part of the numerator:
    //   padBegin + padEnd - (kernel - 1) * dilation - 1
    int64_t dilation = dilations.empty() ? 1 : dilations[spatialIdx];
    int64_t totalPad = pads[spatialIdx] + pads[spatialIdx + numSpatialDims];
    int64_t staticOffset =
        totalPad - (kernelShape[spatialIdx] - 1) * dilation - 1;
    auto staticOffsetVal = emitConstantOp(rewriter, loc, i64Type, staticOffset);

    // numerator = input + staticOffset
    auto inputDim = rewriter.create<DimOp>(loc, inputOperand, dimIdx);
    auto inputDimI64 = rewriter.create<IndexCastOp>(loc, inputDim, i64Type);
    auto numerator =
        rewriter.create<AddIOp>(loc, inputDimI64, staticOffsetVal);

    // denominator = stride
    auto denominator =
        emitConstantOp(rewriter, loc, i64Type, strides[spatialIdx]);

    // quotient, bumped by one in ceil mode when the division is not exact.
    Value size = rewriter.create<SignedDivIOp>(loc, numerator, denominator);
    if (ceilMode) {
      auto remainder =
          rewriter.create<SignedRemIOp>(loc, numerator, denominator);
      auto isExact =
          rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, remainder, zero);
      auto roundedUp = rewriter.create<AddIOp>(loc, size, one);
      size = rewriter.create<SelectOp>(loc, isExact, size, roundedUp);
    }

    // The trailing "+ 1" of the formula, then back to index type.
    size = rewriter.create<AddIOp>(loc, size, one);
    dynamicSizes.emplace_back(
        rewriter.create<IndexCastOp>(loc, size, rewriter.getIndexType()));
  }

  AllocOp alloc = rewriter.create<AllocOp>(loc, memRefType, dynamicSizes);
  if (insertDealloc) {
    auto *parentBlock = alloc.getOperation()->getBlock();
    auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
    // Place the dealloc just ahead of the block's last operation.
    dealloc.getOperation()->moveBefore(&parentBlock->back());
  }
  return alloc;
}
//===----------------------------------------------------------------------===//
// Template function that does pooling.
//
template <typename PoolOp>
struct ONNXPoolOpLowering : public ConversionPattern {
ONNXPoolOpLowering(MLIRContext *ctx)
: ConversionPattern(PoolOp::getOperationName(), 1, ctx) {}
LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands, LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const final { ConversionPatternRewriter &rewriter) const final {
ONNXMaxPoolSingleOutOpOperandAdaptor operandAdaptor(operands); ONNXMaxPoolSingleOutOpOperandAdaptor operandAdaptor(operands);
auto loc = op->getLoc(); auto loc = op->getLoc();
// Match PoolOp poolOp = llvm::dyn_cast<PoolOp>(op);
ONNXMaxPoolSingleOutOp poolOp = llvm::dyn_cast<ONNXMaxPoolSingleOutOp>(op);
// Read kernel_shape attribute // Read kernel_shape attribute
SmallVector<int, 4> kernelShape; SmallVector<int64_t, 4> kernelShape;
auto kernelShapeAttribute = poolOp.kernel_shapeAttr(); auto kernelShapeAttribute = poolOp.kernel_shapeAttr();
for (auto dim : kernelShapeAttribute.getValue()) for (Attribute dim : kernelShapeAttribute.getValue())
kernelShape.emplace_back(dim.cast<IntegerAttr>().getInt()); kernelShape.emplace_back(dim.cast<IntegerAttr>().getInt());
// Read strides attribute // Read strides attribute
SmallVector<int, 4> strides; SmallVector<int64_t, 4> strides;
auto stridesAttribute = poolOp.stridesAttr(); auto stridesAttribute = poolOp.stridesAttr();
for (auto stride : stridesAttribute.getValue()) for (Attribute stride : stridesAttribute.getValue())
strides.emplace_back(stride.cast<IntegerAttr>().getInt()); strides.emplace_back(stride.cast<IntegerAttr>().getInt());
// Read ceil_mode attribute // Read ceil_mode attribute
auto ceilMode = poolOp.ceil_mode().getSExtValue(); auto ceilMode = poolOp.ceil_mode().getSExtValue();
// Read pads attribute // Read pads attribute
SmallVector<int, 4> pads; SmallVector<int64_t, 4> pads;
auto padsAttribute = poolOp.padsAttr(); auto padsAttribute = poolOp.padsAttr();
for (auto pad : padsAttribute.getValue()) for (Attribute pad : padsAttribute.getValue())
pads.emplace_back(pad.cast<IntegerAttr>().getInt()); pads.emplace_back(pad.cast<IntegerAttr>().getInt());
// Read dilations attribute // Read dilations attribute if the op has.
SmallVector<int, 4> dilations; std::vector<int64_t> dilations = getDilations<PoolOp>(poolOp);
auto dilationsAttribute = poolOp.dilationsAttr(); bool isDilated = !dilations.empty();
for (auto dilation : dilationsAttribute.getValue())
dilations.emplace_back(dilation.cast<IntegerAttr>().getInt());
// Type information about the input and result of this operation. // Type information about the input and result of this operation.
auto inputOperand = operandAdaptor.X(); auto inputOperand = operandAdaptor.X();
auto inputShape = inputOperand.getType().cast<MemRefType>().getShape(); auto inputShape = inputOperand.getType().cast<MemRefType>().getShape();
auto memRefType = convertToMemRefType(*op->result_type_begin()); auto memRefType = convertToMemRefType(*op->result_type_begin());
auto resultShape = memRefType.getShape(); auto outputShape = memRefType.getShape();
auto resultElementType = memRefType.getElementType(); auto outputElementType = memRefType.getElementType();
// Batch indices: N and C dimensions // Kernel offset in the input shape.
int batchRank = 2; int kernelOffset = inputShape.size() - kernelShape.size();
// Insert an allocation and deallocation for the result of this operation. // Insert an allocation and deallocation for the output of this operation.
Value alloc; Value alloc;
bool insertDealloc = checkInsertDealloc(op); bool insertDealloc = checkInsertDealloc(op);
if (hasAllConstantDimensions(memRefType)) if (hasAllConstantDimensions(memRefType))
alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc);
else { else {
// Compute dimensions of the result of this operation. alloc = insertAllocAndDeallocForPooling(rewriter, loc, insertDealloc,
SmallVector<Value, 2> allocOperands; memRefType, inputOperand, kernelShape, pads, strides, dilations,
for (int i = 0; i < batchRank; ++i) { ceilMode);
if (resultShape[i] < 0) {
auto dim = rewriter.create<DimOp>(loc, inputOperand, i);
allocOperands.emplace_back(dim);
}
}
Value zero, one;
if (ceilMode) {
zero = rewriter.create<ConstantOp>(
loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0));
}
one = rewriter.create<ConstantOp>(
loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 1));
int spatialRank = resultShape.size() - batchRank;
for (int i = batchRank; i < resultShape.size(); ++i) {
if (resultShape[i] < 0) {
// dim =
// let numerator = (input + pad - (kernel - 1) * dilation - 1)
// in let denomitor = stride
// in
// if (ceilMode)
// ceil(numerator / denominator) + 1
// else
// floor(numerator / denominator) + 1
int spatialIndex = i - batchRank;
// numerator = (input + pad - (kernel - 1) * dilation - 1)
auto inputDim = rewriter.create<DimOp>(loc, inputOperand, i);
auto inputVal = rewriter.create<IndexCastOp>(
loc, inputDim, rewriter.getIntegerType(64));
int64_t padKernelDilation =
(pads[spatialIndex] + pads[spatialIndex + spatialRank]) -
(kernelShape[spatialIndex] - 1) * dilations[spatialIndex] - 1;
auto padKernelDilationVal = rewriter.create<ConstantOp>(
loc, rewriter.getIntegerAttr(
rewriter.getIntegerType(64), padKernelDilation));
auto numeratorVal =
rewriter.create<AddIOp>(loc, inputVal, padKernelDilationVal);
// denominator
auto denominatorVal = rewriter.create<ConstantOp>(
loc, rewriter.getIntegerAttr(
rewriter.getIntegerType(64), strides[spatialIndex]));
// numerator / denominator
Value dimVal =
rewriter.create<SignedDivIOp>(loc, numeratorVal, denominatorVal);
if (ceilMode) {
auto remainder = rewriter.create<SignedRemIOp>(
loc, numeratorVal, denominatorVal);
auto isZero = rewriter.create<CmpIOp>(
loc, CmpIPredicate::eq, remainder, zero);
auto dimPlusOne = rewriter.create<AddIOp>(loc, dimVal, one);
dimVal = rewriter.create<SelectOp>(loc, isZero, dimVal, dimPlusOne);
}
dimVal = rewriter.create<AddIOp>(loc, dimVal, one);
allocOperands.emplace_back(rewriter.create<IndexCastOp>(
loc, dimVal, rewriter.getIndexType()));
}
}
alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);
if (insertDealloc) {
auto *parentBlock = alloc.getDefiningOp()->getBlock();
auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
dealloc.getOperation()->moveBefore(&parentBlock->back());
}
} }
// R = MaxPool(D) // output = Pool(input)
// //
// The input/output shapes will look like this: // The input/output shapes will look like this:
// //
// D (NxCxHxW) -> R (NxCxRHxRW) // input (NxCxHxW) -> output (NxCxHOxWO)
// //
// The loop nest will look as follows: // The loop nest will look as follows:
// //
// strides = [s1, s2] // kernelShape = [kH, kW]
// pads = [ptH, ptW, pbH, pbW]
// strides = [sH, sW]
// dilations = [dH, dW]
// round = ceil if ceilMode else floor
// //
// for n = 0 .. N: // for n in range(N):
// for c = 0 .. C: // for c in range(C):
// for r1 = 0 .. RH: // for ho in range(HO):
// for r2 = 0 .. RW: // for wo in range(WO):
// R[n][c][r1][r2] = negative_infinity; // # Initialize values for the output.
// for k1 = 0 .. KH: // output[n][c][ho][wo] = getIdentityValue(...);
// for k2 = 0 .. KW:
// t = D[n][c][s1 * r1 + k1 * d1][s2 * r2 + k2 * d2];
// R[n][c][r1][r2] = max(R[n][c][r1][r2], t);
// //
// Naming: // # Thanks to Tian (@tjingrant) for the following derivation about
// n, c, r1, r2: outer loop nest indices // # firstValid.
// k1, k2: inner loop nest indices // # When dilation is non-unit, the first valid pixel to
// s1, s2: strides // # apply pooling on will not be the 0-th pixel, but rather
// d1, d2: dilations // # the smallest integer n to make -pH + n * dH greater than
// # or equal to 0.
// # We derive what is this smallest n:
// # -pH + n * dH >= 0
// # n * dH >= pH
// # n >= pH/dH
// # thus n = ceil(pH/dH)
// # thus the first valid pixel location is
// # ceil(pH / dilation) * dilation - pH
// //
// TODO: handle padding. // firstValidH = ceil(float(ptH) / float(dH)) * dH - ptH
// startH = max(firstValidH, ho * sH - ptH)
// endH = min(H, ho * sH + (kH -1) * dH + 1 - ptH)
//
// firstValidW = ceil(float(ptW) / float(dW)) * dW - ptW
// startW = max(firstValidW, wo * sW - ptW)
// endW = min(W, wo * sW + (kW - 1) * dW + 1 - ptW)
//
// hDim= round(float(endH - startH) / float(dH))
// wDim= round(float(endW - startW) / float(dW))
//
// # Apply the pooling window.
// # The pooling window can be smaller than the kernel when slicing
// # over the border edges.
// for hi in range(startH, endH, dH):
// for wi in range(startW, endW, dW):
// output[n, c, ho, wo] = emitScalarOpFor(output[n, c, ho, wo],
// input[n, c, hi, wi]);
//
// # The above two for-loops are rewritten as follows:
// # (since KrnlIterateOp has not supported `step` yet)
// for hp in range(hDim):
// for wp in range(wDim):
// hi = hp * dH + startH
// wi = wp * dW + startW
// output[n, c, ho, wo] = emitScalarOpFor(output[n, c, ho, wo],
// input[n, c, hi, wi]);
//
// # Do post processing such as taking average pooling:
// postProcessPoolingWindow(...)
//
// Helper functions:
// getIdentityValue(): to return the identity value
// - negative infinity for MaxPool
// - 0 for AveragePool
// emitScalarOpFor(): to do primitive computation for Pooling, e.g.
// - compute max for MaxPool
// - compute sum for AveragePool
// postProcessPoolingWindow(): to do post processing over the whole
// pooling window, e.g.
// - do nothing in case of MaxPool
// - calculate the average in case of AveragePool, e.g.
// if hDim * wDim> 0:
// output[n, c, ho, wo] = output[n, c, ho, wo] / (hDim*wDim)
// //
// 1. Define outer loops and emit empty optimization block. // Identity value of the operation.
auto nOuterLoops = resultShape.size(); auto identity = getIdentityValue<PoolOp>(rewriter, loc, outputElementType);
BuildKrnlLoop outerLoops(rewriter, loc, nOuterLoops);
outerLoops.createDefineOptimizeAndIterateOp(alloc);
rewriter.setInsertionPointToStart(outerLoops.getIterateBlock()); // 1. Define output loops to compute one output pixel.
// for n in range(N):
// for c in range(C):
// for ho in range(HO):
// for wo in range(WO):
BuildKrnlLoop outputLoops(rewriter, loc, outputShape.size());
outputLoops.createDefineOptimizeAndIterateOp(alloc);
auto ipMainRegion = rewriter.saveInsertionPoint();
rewriter.setInsertionPointToStart(outputLoops.getIterateBlock());
{ {
// 2. Emit the body of the outer loop nest. // 2. Emit the body of the output loop nest, which applies a pooling
SmallVector<Value, 4> resultIndices; // window to a region in the input, producing one output pixel.
for (int i = 0; i < nOuterLoops; ++i) SmallVector<Value, 4> outputIndices;
resultIndices.emplace_back(outerLoops.getInductionVar(i)); for (int i = 0; i < outputShape.size(); ++i)
outputIndices.emplace_back(outputLoops.getInductionVar(i));
// 2.1 Emit: R[n][c][r1][r2] = negative_infinity; // 2.1 Emit: output[n][c][ho][wo] = identity
Value identity = getIdentityValue<ONNXMaxPoolSingleOutOp>( rewriter.create<StoreOp>(loc, identity, alloc, outputIndices);
rewriter, loc, resultElementType);
rewriter.create<StoreOp>(loc, identity, alloc, resultIndices);
// 2.2 Define inner loops. // 2.2 Emit affine maps which express the lower and upper bounds for the
int nInnerLoops = kernelShape.size(); // pooling window's dimensions.
BuildKrnlLoop innerLoops(rewriter, loc, nInnerLoops); // The pooling window can be smaller than the kernel when slicing it over
innerLoops.createDefineAndOptimizeOp(); // the border edges. Thus, we will compute the start and end indices for
// for Kx = 0 .. KX // each dimension as follows.
for (int i = 0; i < nInnerLoops; ++i) // firstValidH = ceil(float(ptH) / float(dH)) * dH - ptH
innerLoops.pushBounds(0, kernelShape[i]); // startH = max(firstValidH, ho * sH - ptH)
// endH = min(H, ho * sH + (kH - 1) * dH + 1 - ptH)
// hDim = round(float(endH - startH) / float(dH))
// 2.3 Emit inner loop nest. // Prepare induction variables and constants as arguments for the affine
innerLoops.createIterateOp(); // maps.
rewriter.setInsertionPointToStart(innerLoops.getIterateBlock()); SmallVector<SmallVector<Value, 4>, 4> IVsAndConstants;
{ { // Construct IVsAndConstants.
// 3. Emit inner loop body for (int i = 0; i < kernelShape.size(); ++i) {
// t = D[n][c][s1 * r1 + k1 * d1][s2 * r2 + k2 * d2]; SmallVector<Value, 4> ic;
// R[n][c][r1][r2] = max(R[n][c][r1][r2], t); // d0, output
ic.emplace_back(outputLoops.getInductionVar(i + kernelOffset));
// 3.1 Prepare indices for accesing the data tensor. // s0, input dim
SmallVector<Value, 4> dataIndices; if (inputShape[i + kernelOffset] < 0) {
// 3.1.1 Batch indices: n, c ic.emplace_back(
for (int i = 0; i < batchRank; ++i) rewriter.create<DimOp>(loc, inputOperand, i + kernelOffset));
dataIndices.emplace_back(outerLoops.getInductionVar(i));
// 3.1.2 Insert spatial indices: sX * rX + kX * dX
for (int i = batchRank; i < nOuterLoops; ++i) {
// Get index along the inner loop's induction variables.
// It is used to obtain kernel/pad/stride/dilation index.
int j = i - batchRank;
Value spatialIndex = outerLoops.getInductionVar(i);
// If strides are present (not default) then emit the correct access
// index.
// sX *= rX
if (strides[i - batchRank] > 1) {
auto strideIndex = emitConstantOp(
rewriter, loc, rewriter.getIndexType(), strides[j]);
spatialIndex = rewriter.create<MulIOp>(
loc, strideIndex, outerLoops.getInductionVar(i));
}
// Dilate the kernel index only if the dilation value is not one (not
// default). Otherwise, just add kernelIndex.
auto kernelIndex = innerLoops.getInductionVar(j);
if (dilations[j] > 1) {
// sX += dX * kW
auto dilationIndex = emitConstantOp(
rewriter, loc, rewriter.getIndexType(), dilations[j]);
auto dilationKernelIndex =
rewriter.create<MulIOp>(loc, dilationIndex, kernelIndex);
spatialIndex =
rewriter.create<AddIOp>(loc, spatialIndex, dilationKernelIndex);
} else { } else {
// sX += kX ic.emplace_back(emitConstantOp(rewriter, loc,
spatialIndex = rewriter.getIndexType(), inputShape[i + kernelOffset]));
rewriter.create<AddIOp>(loc, spatialIndex, kernelIndex);
} }
// s1, kernel dim
ic.emplace_back(emitConstantOp(
rewriter, loc, rewriter.getIndexType(), kernelShape[i]));
// s2, pad dim
ic.emplace_back(
emitConstantOp(rewriter, loc, rewriter.getIndexType(), pads[i]));
// s3, stride dim
ic.emplace_back(emitConstantOp(
rewriter, loc, rewriter.getIndexType(), strides[i]));
// s4, dilation dim
ic.emplace_back(emitConstantOp(rewriter, loc, rewriter.getIndexType(),
(isDilated) ? dilations[i] : 1));
IVsAndConstants.emplace_back(ic);
}
}
// If ceil mode or dilation is enabled, then the calculated access // Affine maps for the pooling window.
// index may exceed its dimension. In such a case, we will use the AffineMap poolStartMap, poolEndMap, poolDimMap;
// maximum index, which causes multiple visits to the element of the { // Construct poolStartMap, poolEndMap and poolDimMap.
// maximum index. // AffineExpr(s) to obtain the dimensions and symbols.
// TODO: Avoid multiple visits. AffineExpr outputIndex = rewriter.getAffineDimExpr(0);
// Example of out-of-bound. AffineExpr inputDim = rewriter.getAffineSymbolExpr(0);
// - Given a 5x5 input X AffineExpr kernelDim = rewriter.getAffineSymbolExpr(1);
// X = [[0, 0, 0, 0, 0], AffineExpr padTopDim = rewriter.getAffineSymbolExpr(2);
// [1, 1, 1, 1, 1], AffineExpr strideDim = rewriter.getAffineSymbolExpr(3);
// [2, 2, 2, 2, 2], AffineExpr dilationDim = rewriter.getAffineSymbolExpr(4);
// [3, 3, 3, 3, 3], AffineExpr start1 =
// [4, 4, 4, 4, 4]] (padTopDim).ceilDiv(dilationDim) * dilationDim - padTopDim;
// - Do MaxPool with strides=[2, 2], kernel=[2, 2], ceilMode=true, AffineExpr start2 = outputIndex * strideDim - padTopDim;
// output is a 3x3 array: AffineExpr end1 = inputDim;
// Y = [[1, 1, 1], AffineExpr end2 = outputIndex * strideDim +
// [3, 3, 3], (kernelDim - 1) * dilationDim + 1 - padTopDim;
// [4, 4, 4]]
// - When computing Y[2, 0]: // poolDimMap
// - In case of kernelIndex = 1, stride = 2 SmallVector<AffineExpr, 4> dimExpr;
// - No dilation: spatialIndex = 2 * 2 + 1 = 5 // Upperbound for an affine.for is `min AffineMap`, where `min` is
// => out of bound // automatically inserted when an affine.for is constructed from
// - dilation = 2: spatialIndex = 2 * 2 + 2 * 1 = 6 // an AffineMap, thus we rewrite `endH - startH` as follows:
// => out of bound // endH - startH
if (dilations[j] > 1 or ceilMode) { // = min(end1, end2) - max(start1, start2)
Value upperIndex; // = min(end1 - start1, end1 - start2, end2 - start1, end2 - start2)
if (inputShape[i] < 0) { AffineExpr dimExpr1 = end1 - start1;
Value inputDim = rewriter.create<DimOp>(loc, inputOperand, i); AffineExpr dimExpr2 = end1 - start2;
Value one = rewriter.create<ConstantIndexOp>(loc, 1); AffineExpr dimExpr3 = end2 - start1;
upperIndex = rewriter.create<SubIOp>(loc, inputDim, one); AffineExpr dimExpr4 = end2 - start2;
} else { for (AffineExpr de : {dimExpr1, dimExpr2, dimExpr3, dimExpr4}) {
upperIndex = if (isDilated) {
rewriter.create<ConstantIndexOp>(loc, inputShape[i] - 1); de = de + 1;
de =
(ceilMode) ? de.ceilDiv(dilationDim) : de.floorDiv(dilationDim);
}
dimExpr.emplace_back(de);
}
poolDimMap = AffineMap::get(1, 5, dimExpr);
// poolStartMap and poolEndMap
poolStartMap = AffineMap::get(1, 5, {start1, start2});
poolEndMap = AffineMap::get(1, 5, {end1, end2});
}
// Obtain values from the affine maps.
SmallVector<Value, 4> poolStartValues;
SmallVector<Value, 4> poolDimValues;
{ // Construct poolStartValues and poolDimValues.
for (int i = 0; i < kernelShape.size(); ++i) {
Value startIndex = rewriter.create<AffineMaxOp>(
loc, poolStartMap, ValueRange(IVsAndConstants[i]));
poolStartValues.emplace_back(startIndex);
Value endIndex = rewriter.create<AffineMinOp>(
loc, poolEndMap, ValueRange(IVsAndConstants[i]));
Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex);
if (isDilated) {
Value one =
emitConstantOp(rewriter, loc, rewriter.getIndexType(), 1);
Value numerator = rewriter.create<AddIOp>(loc, dim, one);
Value denominator = IVsAndConstants[i][5]; // dilations[i]
dim = rewriter.create<SignedDivIOp>(loc, numerator, denominator);
if (ceilMode) {
auto remainder =
rewriter.create<SignedRemIOp>(loc, numerator, denominator);
Value zero =
emitConstantOp(rewriter, loc, rewriter.getIndexType(), 0);
auto isZero = rewriter.create<CmpIOp>(
loc, CmpIPredicate::eq, remainder, zero);
auto dimPlusOne = rewriter.create<AddIOp>(loc, dim, one);
dim = rewriter.create<SelectOp>(loc, isZero, dim, dimPlusOne);
} }
auto greaterCondition = rewriter.create<CmpIOp>(
loc, CmpIPredicate::sgt, spatialIndex, upperIndex);
spatialIndex = rewriter.create<SelectOp>(
loc, greaterCondition, upperIndex, spatialIndex);
} }
poolDimValues.emplace_back(dim);
}
}
dataIndices.emplace_back(spatialIndex); // 2.3 Define pooling loops.
// for hp in range(hDim):
// for wp in range(wDim):
// hi = hp * dH + startH
// wi = wp * dW + startW
// output[n][c][ho][wo] =
// emitScalarOpFor(output[n][c][ho][wo], input[n, c, hi, wi]);
BuildKrnlLoop poolingLoops(rewriter, loc, kernelShape.size());
poolingLoops.createDefineAndOptimizeOp();
for (int i = 0; i < kernelShape.size(); ++i)
poolingLoops.pushBounds(
0, poolDimMap, llvm::makeArrayRef(IVsAndConstants[i]));
poolingLoops.createIterateOp();
auto ipOuterLoops = rewriter.saveInsertionPoint();
rewriter.setInsertionPointToStart(poolingLoops.getIterateBlock());
{
// 2.4 Emit the body of the pooling loop nest.
// Prepare indices to access a pixel in the input.
std::vector<Value> inputIndices;
{ // Construct inputIndices
for (int i = 0; i < kernelOffset; ++i)
inputIndices.emplace_back(outputIndices[i]);
for (int i = kernelOffset; i < inputShape.size(); ++i) {
int j = i - kernelOffset;
if (isDilated) {
// hi = hp * dH + startH
Value index = rewriter.create<MulIOp>(
loc, poolingLoops.getInductionVar(j), IVsAndConstants[j][5]);
index = rewriter.create<AddIOp>(loc, index, poolStartValues[j]);
inputIndices.emplace_back(index);
} else {
// hi = hp + startH
inputIndices.emplace_back(rewriter.create<AddIOp>(
loc, poolingLoops.getInductionVar(j), poolStartValues[j]));
}
}
} }
// 3.2 Do pooling. // Apply pooling operation.
auto loadData = rewriter.create<LoadOp>(loc, inputOperand, dataIndices); // output[n][c][ho][wo] =
auto loadPartialResult = // emitScalarOpFor(output[n][c][ho][wo], input[n, c, hi, wi]);
rewriter.create<LoadOp>(loc, alloc, resultIndices); Value loadInput =
Value result = emitScalarOpFor<ONNXMaxPoolSingleOutOp>(rewriter, loc, rewriter.create<LoadOp>(loc, inputOperand, inputIndices);
op, resultElementType, {loadPartialResult, loadData}); Value loadPartialOutput =
rewriter.create<StoreOp>(loc, result, alloc, resultIndices); rewriter.create<LoadOp>(loc, alloc, outputIndices);
Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op,
outputElementType, {loadPartialOutput, loadInput});
rewriter.create<StoreOp>(loc, output, alloc, outputIndices);
} }
// 2.5 Post-processing for the pooling window, e.g. taking average.
rewriter.restoreInsertionPoint(ipOuterLoops);
postProcessPoolingWindow<PoolOp>(rewriter, loc, poolOp, alloc,
outputIndices, kernelShape, poolDimValues);
} }
// Go back to the main region.
rewriter.restoreInsertionPoint(ipMainRegion);
rewriter.replaceOp(op, alloc); rewriter.replaceOp(op, alloc);
return success(); return success();
@ -320,5 +565,6 @@ struct ONNXMaxPoolSingleOutOpLowering : public ConversionPattern {
void populateLoweringONNXPoolingOpPattern( void populateLoweringONNXPoolingOpPattern(
OwningRewritePatternList &patterns, MLIRContext *ctx) { OwningRewritePatternList &patterns, MLIRContext *ctx) {
patterns.insert<ONNXMaxPoolSingleOutOpLowering>(ctx); patterns.insert<ONNXPoolOpLowering<ONNXMaxPoolSingleOutOp>>(ctx);
patterns.insert<ONNXPoolOpLowering<ONNXAveragePoolOp>>(ctx);
} }

View File

@ -149,6 +149,15 @@ void KrnlIterateOperandPack::pushOperandBound(Value operand) {
_operands.emplace_back(operand); _operands.emplace_back(operand);
} }
// Record `map` as the next loop bound together with the operands it is applied
// to. When an even number of bounds has been recorded so far, the loop at index
// boundMaps.size() / 2 of `inputLoops` is appended to the operand list first.
// NOTE(review): an even count presumably means this bound opens a new
// (lower, upper) pair for the next loop -- confirm against the other push*Bound
// methods.
void KrnlIterateOperandPack::pushAffineMapBound(
    AffineMap map, ArrayRef<Value> operands) {
  const bool startsNewBoundPair = (boundMaps.size() % 2 == 0);
  if (startsNewBoundPair)
    _operands.emplace_back(inputLoops[boundMaps.size() / 2]);

  boundMaps.emplace_back(AffineMapAttr::get(map));

  // The map's operands follow in the order given.
  for (Value mapOperand : operands)
    _operands.emplace_back(mapOperand);
}
BuildKrnlLoop::BuildKrnlLoop( BuildKrnlLoop::BuildKrnlLoop(
ConversionPatternRewriter &rewriter, Location loc, int loopNum) ConversionPatternRewriter &rewriter, Location loc, int loopNum)
: rewriter(rewriter), loc(loc), originalLoopNum(loopNum), pack(NULL), : rewriter(rewriter), loc(loc), originalLoopNum(loopNum), pack(NULL),
@ -209,6 +218,13 @@ int BuildKrnlLoop::pushBounds(int64_t lowerBound, Value upperBound) {
return pushCount++; return pushCount++;
} }
// Push the bounds of the next loop: a constant lower bound and an upper bound
// given by an affine map applied to `operandsForUpperBoundMap`.
// Returns the value of the push counter before it is incremented.
int BuildKrnlLoop::pushBounds(int64_t lowerBound, AffineMap upperBound,
    ArrayRef<Value> operandsForUpperBoundMap) {
  pack->pushConstantBound(lowerBound);
  pack->pushAffineMapBound(upperBound, operandsForUpperBoundMap);
  const int pushedIndex = pushCount;
  ++pushCount;
  return pushedIndex;
}
int BuildKrnlLoop::pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand, int BuildKrnlLoop::pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand,
int upperBoundMemRefIndex, bool upperBoundMustBeConstant) { int upperBoundMemRefIndex, bool upperBoundMustBeConstant) {
pack->pushConstantBound(lowerBound); pack->pushConstantBound(lowerBound);

View File

@ -87,6 +87,8 @@ struct KrnlIterateOperandPack {
void pushOperandBound(mlir::Value operand); void pushOperandBound(mlir::Value operand);
// Push a bound described by the affine map `map`; `operands` are appended to
// the pack's operand list (presumably the values the map is applied to --
// confirm against the definition).
void pushAffineMapBound(mlir::AffineMap map, ArrayRef<Value> operands);
llvm::SmallVector<mlir::Value, 8> getOperands() const { return _operands; } llvm::SmallVector<mlir::Value, 8> getOperands() const { return _operands; }
mlir::ArrayAttr getAttributes() const { mlir::ArrayAttr getAttributes() const {
@ -159,6 +161,8 @@ public:
// must be of MemRef type. // must be of MemRef type.
int pushBounds(int64_t lowerBound, int64_t upperBound); int pushBounds(int64_t lowerBound, int64_t upperBound);
int pushBounds(int64_t lowerBound, Value upperBound); int pushBounds(int64_t lowerBound, Value upperBound);
// Overload taking a constant lower bound and an upper bound expressed as an
// affine map plus the operands for that map.
int pushBounds(int64_t lowerBound, AffineMap upperBound,
ArrayRef<Value> operandsForUpperBoundMap);
int pushBounds(Value lowerBound, Value upperBound); int pushBounds(Value lowerBound, Value upperBound);
int pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand, int pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand,
int upperBoundMemRefIndex, bool upperBoundMustBeConstant = false); int upperBoundMemRefIndex, bool upperBoundMustBeConstant = false);

View File

@ -98,7 +98,6 @@ def ONNXEntryPointOp: ONNX_Op<"EntryPoint"> {
def ONNXMaxPoolSingleOutOp: ONNX_Op<"MaxPoolSingleOut", def ONNXMaxPoolSingleOutOp: ONNX_Op<"MaxPoolSingleOut",
[NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> { [NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> {
let hasCanonicalizer = 1;
let summary = "ONNX MaxPool operation with a single output."; let summary = "ONNX MaxPool operation with a single output.";
let description = [{ let description = [{
"ONNX MaxPool operation with a single output." "ONNX MaxPool operation with a single output."

View File

@ -33,7 +33,7 @@ bool hasNonZeroInArrayAttr(ArrayAttr attrs) {
} }
// Create an ArrayAttr of IntegerAttr(s) of zero values. // Create an ArrayAttr of IntegerAttr(s) of zero values.
// This function is used for padding attribute in MaxPoolSingleOut. // This function is used for padding attribute in Conv.
ArrayAttr createArrayAttrOfZeros( ArrayAttr createArrayAttrOfZeros(
PatternRewriter &rewriter, ArrayAttr origAttrs) { PatternRewriter &rewriter, ArrayAttr origAttrs) {
int nElements = origAttrs.getValue().size(); int nElements = origAttrs.getValue().size();
@ -51,7 +51,7 @@ ArrayAttr createArrayAttrOfZeros(
// |_____| |_____| // |_____| |_____|
// nZeros nZeros // nZeros nZeros
// //
// This function is used for padding attribute in MaxPoolSingleOut. // This function is used for padding attribute in Conv.
ArrayAttr insertZerosForNonPaddedDims( ArrayAttr insertZerosForNonPaddedDims(
PatternRewriter &rewriter, ArrayAttr origAttrs, int extensionLength) { PatternRewriter &rewriter, ArrayAttr origAttrs, int extensionLength) {
int nDims = (int)origAttrs.getValue().size() / 2; int nDims = (int)origAttrs.getValue().size() / 2;
@ -72,11 +72,6 @@ ArrayAttr insertZerosForNonPaddedDims(
} // end anonymous namespace } // end anonymous namespace
/// on the ONNXMaxPoolSingleOutOp.
void ONNXMaxPoolSingleOutOp::getCanonicalizationPatterns(
OwningRewritePatternList &results, MLIRContext *context) {
results.insert<MaxPoolSingleOutOpPaddingPattern>(context);
}
/// on the ONNXConvOp. /// on the ONNXConvOp.
void ONNXConvOp::getCanonicalizationPatterns( void ONNXConvOp::getCanonicalizationPatterns(
OwningRewritePatternList &results, MLIRContext *context) { OwningRewritePatternList &results, MLIRContext *context) {

View File

@ -33,13 +33,8 @@ class StringAttrOfValue<string val>:
class FloatAttrOfValue<int val>: class FloatAttrOfValue<int val>:
NativeCodeCall<"FloatAttr::get($0.getType().cast<TensorType>().getElementType(), " # val # ")">; NativeCodeCall<"FloatAttr::get($0.getType().cast<TensorType>().getElementType(), " # val # ")">;
// Create a FloatAttr for the negative infinity.
def FloatAttrOfNegativeInfinity:
NativeCodeCall<"FloatAttr::get($0.getType().cast<TensorType>().getElementType(), "
"-std::numeric_limits<double>::infinity())">;
// Create an ArrayAttr of IntegerAttr(s) of zero values. // Create an ArrayAttr of IntegerAttr(s) of zero values.
// This function is used for padding attribute in MaxPoolSingleOut. // This function is used for padding attribute in Conv.
def createArrayAttrOfZerosFrom: def createArrayAttrOfZerosFrom:
NativeCodeCall<"createArrayAttrOfZeros($_builder, $0)">; NativeCodeCall<"createArrayAttrOfZeros($_builder, $0)">;
@ -53,7 +48,7 @@ def createArrayAttrOfZerosFrom:
// |_____| |_____| // |_____| |_____|
// nZeros nZeros // nZeros nZeros
// //
// This function is used for padding attribute in MaxPoolSingleOut. // This function is used for padding attribute in Conv.
class insertZerosForNonPaddedDims<int extensionLength>: class insertZerosForNonPaddedDims<int extensionLength>:
NativeCodeCall<"insertZerosForNonPaddedDims($_builder, $0," NativeCodeCall<"insertZerosForNonPaddedDims($_builder, $0,"
# extensionLength # ")">; # extensionLength # ")">;
@ -66,37 +61,6 @@ def HasNonZeroInArrayAttr: Constraint<CPred<"hasNonZeroInArrayAttr($_self)">,
class IsNotStringAttrOfValue<string val>: class IsNotStringAttrOfValue<string val>:
Constraint<CPred<"$0.cast<StringAttr>().getValue() != \"" # val # "\"">>; Constraint<CPred<"$0.cast<StringAttr>().getValue() != \"" # val # "\"">>;
//===----------------------------------------------------------------------===//
// Rewrite:
// %0 = onnx.MaxPoolSingleOutOp(%D : tensor<DShape>)
// {pads = [b0, b1, ... bK, e0, e1, ..., eK]} ->
// tensor<OutShape>
//
// as:
// %0 = onnx.PadConstantValuePadOp(%D)
// {pads = [0, 0, b0, b1, ... bK, 0, 0, e0, e1, ..., eK]} ->
// tensor<DPaddedShape>
// %1 = onnx.MaxPoolSingleOut(%0 : tensor<DPaddedShape>) {pads = [0, ..., 0]} ->
// tensor<OutShape>
//===----------------------------------------------------------------------===//
def MaxPoolSingleOutOpPaddingPattern: Pat<
(ONNXMaxPoolSingleOutOp:$res
$x,
$auto_pad, $ceil_mode, $dilation, $kernel_shape,
$pads,
$storage_order, $strides),
(ONNXMaxPoolSingleOutOp
(ONNXPadConstantValuePadOp $x,
(insertZerosForNonPaddedDims<2> $pads),
(FloatAttrOfNegativeInfinity $res),
(StringAttrOfValue<"constant">)),
$auto_pad, $ceil_mode, $dilation, $kernel_shape,
(createArrayAttrOfZerosFrom $pads),
$storage_order, $strides),
[(HasNonZeroInArrayAttr:$pads), (IsNotStringAttrOfValue<"VALID"> $auto_pad)]
>;
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// Rewrite: // Rewrite:
// %0 = onnx.ConvOp(%D : tensor<DShape>, %K) // %0 = onnx.ConvOp(%D : tensor<DShape>, %K)

View File

@ -327,7 +327,7 @@ test_to_enable = [
"test_batchnorm_epsilon_cpu", "test_batchnorm_epsilon_cpu",
"test_batchnorm_example_cpu", "test_batchnorm_example_cpu",
# Pooling # MaxPoolSingleOut
"test_maxpool_1d_default_cpu", "test_maxpool_1d_default_cpu",
"test_maxpool_2d_ceil_cpu", "test_maxpool_2d_ceil_cpu",
"test_maxpool_2d_default_cpu", "test_maxpool_2d_default_cpu",
@ -341,6 +341,21 @@ test_to_enable = [
"test_maxpool_2d_strides_cpu", "test_maxpool_2d_strides_cpu",
"test_maxpool_3d_default_cpu", "test_maxpool_3d_default_cpu",
# AveragePool
"test_averagepool_1d_default_cpu",
"test_averagepool_2d_ceil_cpu",
"test_averagepool_2d_default_cpu",
"test_averagepool_2d_pads_count_include_pad_cpu",
"test_averagepool_2d_pads_cpu",
"test_averagepool_2d_precomputed_pads_count_include_pad_cpu",
"test_averagepool_2d_precomputed_pads_cpu",
"test_averagepool_2d_precomputed_same_upper_cpu",
"test_averagepool_2d_precomputed_strides_cpu",
"test_averagepool_2d_same_lower_cpu",
"test_averagepool_2d_same_upper_cpu",
"test_averagepool_2d_strides_cpu",
"test_averagepool_3d_default_cpu",
] ]
# Extract name of all test cases. # Extract name of all test cases.

View File

@ -94,27 +94,3 @@ func @test_gemm_add_fusion_rank3(%arg0: tensor<128x128x256xf32>, %arg1: tensor<1
// return [[GEMM]] : tensor<*xf32> // return [[GEMM]] : tensor<*xf32>
} }
// -----
//CHECK-LABEL: @test_maxpoolsingleout_split(%{{.*}}: tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> {
func @test_maxpoolsingleout_split(%arg0: tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", ceil_mode = 0, kernel_shape = [5,3], pads = [1, 2, 3, 4] } : (tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32>
"std.return"(%0) : (tensor<5x5x36x38xf32>) -> ()
// CHECK-NEXT: %0 = "onnx.PadConstantValuePad"(%arg0) {constant_value = 0xFF800000 : f32, mode = "constant", pads = [0, 0, 1, 2, 0, 0, 3, 4]} : (tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32>
// CHECK-NEXT: %1 = "onnx.MaxPoolSingleOut"(%0) {auto_pad = "NOTSET", ceil_mode = 0 : i64, kernel_shape = [5, 3], pads = [0, 0, 0, 0], storage_order = 0 : i64} : (tensor<5x5x36x38xf32>) -> tensor<5x5x36x38xf32>
// CHECK-NEXT: return %1 : tensor<5x5x36x38xf32>
}
// -----
//CHECK-LABEL: @test_maxpoolsingleout_split_unknown_dims(%{{.*}}: tensor<*xf32>) -> tensor<*xf32> {
func @test_maxpoolsingleout_split_unknown_dims(%arg0: tensor<*xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", ceil_mode = 0, kernel_shape = [5,3], pads = [1, 2, 3, 4] } : (tensor<*xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-NEXT: %0 = "onnx.PadConstantValuePad"(%arg0) {constant_value = 0xFF800000 : f32, mode = "constant", pads = [0, 0, 1, 2, 0, 0, 3, 4]} : (tensor<*xf32>) -> tensor<*xf32>
// CHECK-NEXT: %1 = "onnx.MaxPoolSingleOut"(%0) {auto_pad = "NOTSET", ceil_mode = 0 : i64, kernel_shape = [5, 3], pads = [0, 0, 0, 0], storage_order = 0 : i64} : (tensor<*xf32>) -> tensor<*xf32>
// CHECK-NEXT: return %1 : tensor<*xf32>
}

View File

@ -1505,231 +1505,6 @@ func @test_batchnorm_testmode_1d(%arg0: tensor<10xf32>, %arg1: tensor<1xf32>, %a
// ----- // -----
func @test_maxpooling_singleout_no_pad(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: test_maxpooling_singleout_no_pad
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 31, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 31) {
// CHECK: [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 2, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 2) {
// CHECK: [[H:%.+]] = addi %arg3, %arg5 : index
// CHECK: [[W:%.+]] = addi %arg4, %arg6 : index
// CHECK: [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32>
// CHECK: [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[COMPARE:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: [[SELECT:%.+]] = select [[COMPARE]], [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
// CHECK: }
// CHECK: return [[RES]] : memref<1x3x31x31xf32>
}
// -----
func @test_maxpooling_singleout_no_pad_w_strides(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2], strides = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x16x16xf32>
// CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 16, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) {
// CHECK: [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 2, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 2) {
// CHECK: [[STRIDE_0:%.+]] = constant 2 : index
// CHECK: [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index
// CHECK: [[H:%.+]] = addi [[MUL_0]], %arg5 : index
// CHECK: [[STRIDE_1:%.+]] = constant 2 : index
// CHECK: [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index
// CHECK: [[W:%.+]] = addi [[MUL_1]], %arg6 : index
// CHECK: [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32>
// CHECK: [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: [[COMPARE:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: [[SELECT:%.+]] = select [[COMPARE]], [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: }
// CHECK: }
// CHECK: return [[RES]] : memref<1x3x16x16xf32>
}
// -----
func @test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], ceil_mode = 1} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x16x16xf32>
// CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 16, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) {
// CHECK: [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) {
// CHECK: [[STRIDE_0:%.+]] = constant 2 : index
// CHECK: [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index
// CHECK: [[SPATIAL_H:%.+]] = addi [[MUL_0]], %arg5 : index
// CHECK: [[UPPER_INDEX_0:%.+]] = constant 31 : index
// CHECK: [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index
// CHECK: [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index
// CHECK: [[STRIDE_1:%.+]] = constant 2 : index
// CHECK: [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index
// CHECK: [[SPATIAL_W:%.+]] = addi [[MUL_1]], %arg6 : index
// CHECK: [[UPPER_INDEX_1:%.+]] = constant 31 : index
// CHECK: [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index
// CHECK: [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index
// CHECK: [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32>
// CHECK: [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32>
// CHECK: }
// CHECK: }
// CHECK: return [[RES]] : memref<1x3x16x16xf32>
}
// -----
func @test_maxpooling_singleout_no_pad_w_strides_w_dilation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], dilations = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_dilation
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x14x14xf32>
// CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 14, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 14) {
// CHECK: [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32>
// CHECK: [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) {
// CHECK: [[STRIDE_0:%.+]] = constant 2 : index
// CHECK: [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index
// CHECK: [[STRIDE_1:%.+]] = constant 2 : index
// CHECK: [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg5 : index
// CHECK: [[SPATIAL_H:%.+]] = addi [[MUL_0]], [[MUL_1]] : index
// CHECK: [[UPPER_INDEX_0:%.+]] = constant 31 : index
// CHECK: [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index
// CHECK: [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index
// CHECK: [[STRIDE_0_1:%.+]] = constant 2 : index
// CHECK: [[MUL_0_1:%.+]] = muli [[STRIDE_0_1]], %arg4 : index
// CHECK: [[STRIDE_1_1:%.+]] = constant 2 : index
// CHECK: [[MUL_1_1:%.+]] = muli [[STRIDE_1_1]], %arg6 : index
// CHECK: [[SPATIAL_W:%.+]] = addi [[MUL_0_1]], [[MUL_1_1]] : index
// CHECK: [[UPPER_INDEX_1:%.+]] = constant 31 : index
// CHECK: [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index
// CHECK: [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index
// CHECK: [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32>
// CHECK: [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32>
// CHECK: [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32>
// CHECK: }
// CHECK: }
// CHECK: return [[RES]] : memref<1x3x14x14xf32>
}
// -----
func @test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode_w_unknown_dims(%arg0 : tensor<?x3x?x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], ceil_mode = 1} : (tensor<?x3x?x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode_w_unknown_dims
// CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x3x?x32xf32>
// CHECK: [[ZERO:%.+]] = constant 0 : i64
// CHECK: [[ONE:%.+]] = constant 1 : i64
// CHECK: [[DIM_1:%.+]] = dim %arg0, 2 : memref<?x3x?x32xf32>
// CHECK: [[DIM_1_i64:%.+]] = index_cast [[DIM_1]] : index to i64
// CHECK: [[KERNEL_PAD_DILATION:%.+]] = constant -3 : i64
// CHECK: [[NUMERATOR:%.+]] = addi [[DIM_1_i64]], [[KERNEL_PAD_DILATION]] : i64
// CHECK: [[DENOMINATOR:%.+]] = constant 2 : i64
// CHECK: [[DIV:%.+]] = divi_signed [[NUMERATOR]], [[DENOMINATOR]] : i64
// CHECK: [[REMAINDER:%.+]] = remi_signed [[NUMERATOR]], [[DENOMINATOR]] : i64
// CHECK: [[IS_ZERO:%.+]] = cmpi "eq", [[REMAINDER]], [[ZERO]] : i64
// CHECK: [[DIV_PLUS_ONE:%.+]] = addi [[DIV]], [[ONE]] : i64
// CHECK: [[SELECT:%.+]] = select [[IS_ZERO]], [[DIV]], [[DIV_PLUS_ONE]] : i64
// CHECK: [[SELECT_PLUS_ONE:%.+]] = addi [[SELECT]], [[ONE]] : i64
// CHECK: [[DIM_1_FINAL:%.+]] = index_cast [[SELECT_PLUS_ONE]] : i64 to index
// CHECK: [[RES:%.+]] = alloc([[DIM_0]], [[DIM_1_FINAL]]) : memref<?x3x?x16xf32>
// CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x3x?x16xf32>
// CHECK: [[DIM_3:%.+]] = dim [[RES]], 2 : memref<?x3x?x16xf32>
// CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to [[DIM_2]], [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to [[DIM_3]], [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) {
// CHECK: [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32>
// CHECK: [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) {
// CHECK: [[STRIDE_0:%.+]] = constant 2 : index
// CHECK: [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index
// CHECK: [[SPATIAL_H:%.+]] = addi [[MUL_0]], %arg5 : index
// CHECK: [[DIM_0_0:%.+]] = dim %arg0, 2 : memref<?x3x?x32xf32>
// CHECK: [[ONE_INDEX:%.+]] = constant 1 : index
// CHECK: [[UPPER_INDEX_0:%.+]] = subi [[DIM_0_0]], [[ONE_INDEX]] : index
// CHECK: [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index
// CHECK: [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index
// CHECK: [[STRIDE_1:%.+]] = constant 2 : index
// CHECK: [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index
// CHECK: [[SPATIAL_W:%.+]] = addi [[MUL_1]], %arg6 : index
// CHECK: [[UPPER_INDEX_1:%.+]] = constant 31 : index
// CHECK: [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index
// CHECK: [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index
// CHECK: [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<?x3x?x32xf32>
// CHECK: [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32>
// CHECK: [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32>
// CHECK: }
// CHECK: }
// CHECK: return [[RES]] : memref<?x3x?x16xf32>
}
// -----
func @test_abs_float(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> { func @test_abs_float(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
%0 = "onnx.Abs"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32> %0 = "onnx.Abs"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> () "std.return"(%0) : (tensor<*xf32>) -> ()
@ -1810,7 +1585,6 @@ func @test_constant_dense_2d_value(%arg0: tensor<1xf32>) -> tensor<*xf32> {
// CHECK: return [[RES]] : memref<3x2xf32> // CHECK: return [[RES]] : memref<3x2xf32>
} }
// ----- // -----
func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>, %arg2 : tensor<5x5x5x32xf32>) -> tensor<5x5x9x32xf32> { func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>, %arg2 : tensor<5x5x5x32xf32>) -> tensor<5x5x9x32xf32> {
@ -1849,3 +1623,133 @@ func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>,
// CHECK: return [[RES]] : memref<5x5x9x32xf32> // CHECK: return [[RES]] : memref<5x5x9x32xf32>
} }
// -----
// Verifies the overall structure emitted when lowering a pooling op (here an
// AveragePool with a 2x2 kernel on a 1x3x32x32 input): the affine maps that
// are defined, the 1x3x31x31 output allocation, the four outer loops over the
// output, the two inner pooling loops whose upper bounds are an affine-map
// `min`, and the load/store sequence inside and after the pooling loops.
func @test_pool_general_computation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> ((s2 ceildiv s4) * s4 - s2, d0 * s3 - s2)>
// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0, d0 * s3 + (s1 - 1) * s4 - s2 + 1)>
// CHECK-DAG: #{{.*}} = affine_map<() -> (0)>
// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0 - ((s2 ceildiv s4) * s4 - s2), -(d0 * s3 - s2) + s0, d0 * s3 + (s1 - 1) * s4 - s2 - ((s2 ceildiv s4) * s4 - s2) + 1, d0 * s3 + (s1 - 1) * s4 - s2 - (d0 * s3 - s2) + 1)>
// CHECK-LABEL: @test_pool_general_computation
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32
// CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) {
// CHECK: store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) {
// CHECK: {{.*}} = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32>
// CHECK: {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
// CHECK: {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
}
// -----
// Verifies that the AveragePool lowering stores the constant 0.0 into the
// output buffer as its initial (identity) value.
func @test_averagepool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: @test_averagepool_identity_value
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32
// CHECK: store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
}
// -----
// Verifies that the MaxPoolSingleOut lowering stores the constant 0xFF800000
// (the f32 bit pattern of negative infinity) into the output buffer as its
// initial (identity) value.
func @test_maxpool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: @test_maxpool_identity_value
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32
// CHECK: store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
}
// -----
// Verifies the AveragePool-specific computation: inside the pooling loops the
// loaded input element is added (addf) to the running output value, and after
// the pooling loops the accumulated value is divided (divf) and stored back.
func @test_averagepool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: @test_averagepool_pooling_operation
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) {
// CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) {
// CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32>
// CHECK: [[OUTPUT_LOAD:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[SUM:%.+]] = addf [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32
// CHECK: store [[SUM]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
// CHECK: [[NUMERATOR:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[AVERAGE:%.+]] = divf [[NUMERATOR]], {{.*}} : f32
// CHECK: store [[AVERAGE]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
}
// -----
// Verifies the MaxPoolSingleOut-specific computation: inside the pooling loops
// the running output value is compared with the loaded input element
// (cmpf "ogt") and the larger one is selected and stored. The negative
// checks assert that no extra load/store of the output follows the pooling
// loops (unlike AveragePool, there is no post-processing step).
func @test_maxpool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> {
%0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32>
"std.return"(%0) : (tensor<*xf32>) -> ()
// CHECK-LABEL: @test_maxpool_pooling_operation
// CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32>
// CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4
// CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3
// CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) {
// CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2
// CHECK: [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops {
// CHECK: krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) {
// CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32>
// CHECK: [[OUTPUT_LOAD:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: [[GREATER:%.+]] = cmpf "ogt", [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32
// CHECK: [[SELECT:%.+]] = select [[GREATER]], [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32
// CHECK: store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
// CHECK-NOT: {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK-NOT: store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32>
// CHECK: }
}