Lower MaxPooling and AveragePool to Krnl dialect using AffineMap (#38)
* Create a template for pooling and add support for AveragePool * Edit MLIR tests for MaxPool according to the new lowering template for pooling * Dealloc temporary variables * Support count_include_pad for AveragePool * Add MLIR tests for AveragePool lowering * Make changes according to Tian's comments * Push AffineMap as upper bound for KrnlIterateOp * Test AffineMap to use in Pooling * Replace the old implementaion by a new one using AffineMap * Fix the computation when dilations are non-unit * Clean up the old code * Remove AveragePool from Canonicalization pass * Fix computing the end indices of a filter window * Refactor the code for pooling * Revise pushAffineMapBound * Add MLIR tests * Remove unused functions * Fix check-onnx-backend build on x86 Linux. (#91) * Add the split marker to test files (#90) Co-authored-by: Tian Jin <tjingrant@gmail.com> Co-authored-by: gongsu832 <gong_su@hotmail.com> Co-authored-by: Tian Jin <tjingrant@gmail.com>
This commit is contained in:
		
							parent
							
								
									e32f531546
								
							
						
					
					
						commit
						eac2297624
					
				|  | @ -8,6 +8,7 @@ | |||
| //
 | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| 
 | ||||
| #include "mlir/IR/AffineExpr.h" | ||||
| #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp" | ||||
| 
 | ||||
| using namespace mlir; | ||||
|  | @ -19,6 +20,19 @@ Value getIdentityValue<ONNXMaxPoolSingleOutOp>( | |||
|   return emitNegativeInfinityConstantOp(rewriter, loc, type); | ||||
| } | ||||
| 
 | ||||
| template <> | ||||
| Value getIdentityValue<ONNXAveragePoolOp>( | ||||
|     ConversionPatternRewriter &rewriter, Location loc, Type type) { | ||||
|   return emitConstantOp(rewriter, loc, type, 0); | ||||
| } | ||||
| 
 | ||||
| // Scalar operations
 | ||||
| template <> | ||||
| struct ScalarOp<ONNXAveragePoolOp> { | ||||
|   using FOp = AddFOp; | ||||
|   using IOp = AddIOp; | ||||
| }; | ||||
| 
 | ||||
| template <> | ||||
| Value emitScalarOpFor<ONNXMaxPoolSingleOutOp>( | ||||
|     ConversionPatternRewriter &rewriter, Location loc, Operation *op, | ||||
|  | @ -30,288 +44,519 @@ Value emitScalarOpFor<ONNXMaxPoolSingleOutOp>( | |||
|   return result; | ||||
| } | ||||
| 
 | ||||
| struct ONNXMaxPoolSingleOutOpLowering : public ConversionPattern { | ||||
|   ONNXMaxPoolSingleOutOpLowering(MLIRContext *ctx) | ||||
|       : ConversionPattern( | ||||
|             mlir::ONNXMaxPoolSingleOutOp::getOperationName(), 1, ctx) {} | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| // Get dilation values
 | ||||
| //
 | ||||
| template <typename PoolOp> | ||||
| std::vector<int64_t> getDilations(PoolOp poolOp) { | ||||
|   return {}; | ||||
| } | ||||
| 
 | ||||
| // MaxPool has dilations attribute.
 | ||||
| template <> | ||||
| std::vector<int64_t> getDilations<ONNXMaxPoolSingleOutOp>( | ||||
|     ONNXMaxPoolSingleOutOp poolOp) { | ||||
|   std::vector<int64_t> dilations; | ||||
|   auto dilationsAttribute = poolOp.dilationsAttr(); | ||||
|   bool isDefaultDilations = true; | ||||
|   for (auto dilation : dilationsAttribute.getValue()) { | ||||
|     int64_t dilationValue = dilation.cast<IntegerAttr>().getInt(); | ||||
|     if (dilationValue > 1 and isDefaultDilations) | ||||
|       isDefaultDilations = false; | ||||
|     dilations.emplace_back(dilationValue); | ||||
|   } | ||||
|   if (isDefaultDilations) | ||||
|     return {}; | ||||
|   else | ||||
|     return dilations; | ||||
| } | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| // Get count_include_pad values
 | ||||
| //
 | ||||
| template <typename PoolOp> | ||||
| bool getCountIncludePad(PoolOp poolOp) { | ||||
|   return false; | ||||
| } | ||||
| 
 | ||||
| // AveragePool has count_include_pad attribute.
 | ||||
| template <> | ||||
| bool getCountIncludePad<ONNXAveragePoolOp>(ONNXAveragePoolOp poolOp) { | ||||
|   return (poolOp.count_include_pad() == 1); | ||||
| } | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| // Helper function to do post-processing after applying a filter window.
 | ||||
| //
 | ||||
| template <typename PoolOp> | ||||
| void postProcessPoolingWindow(ConversionPatternRewriter &rewriter, Location loc, | ||||
|     PoolOp poolOp, Value alloc, ArrayRef<Value> resultIndices, | ||||
|     ArrayRef<int64_t> kernelShape, ArrayRef<Value> poolDimValues) {} | ||||
| 
 | ||||
| // Calculate the average value for AveragePool.
 | ||||
| template <> | ||||
| void postProcessPoolingWindow<ONNXAveragePoolOp>( | ||||
|     ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp, | ||||
|     Value alloc, ArrayRef<Value> resultIndices, ArrayRef<int64_t> kernelShape, | ||||
|     ArrayRef<Value> poolDimValues) { | ||||
|   // AveragePool's result type is FloatType, so it's safe to use DivFOp, SubFOp.
 | ||||
|   bool countIncludePad = getCountIncludePad<ONNXAveragePoolOp>(poolOp); | ||||
|   Value numerator = rewriter.create<LoadOp>(loc, alloc, resultIndices); | ||||
|   Value denominator; | ||||
|   if (countIncludePad) { | ||||
|     int64_t kernelSize = 1; | ||||
|     for (int i = 0; i < kernelShape.size(); ++i) | ||||
|       kernelSize *= kernelShape[i]; | ||||
|     denominator = | ||||
|         emitConstantOp(rewriter, loc, numerator.getType(), kernelSize); | ||||
|   } else { | ||||
|     denominator = poolDimValues[0]; | ||||
|     for (int i = 1; i < poolDimValues.size(); ++i) | ||||
|       denominator = rewriter.create<MulIOp>(loc, denominator, poolDimValues[i]); | ||||
|     denominator = rewriter.create<IndexCastOp>( | ||||
|         loc, denominator, rewriter.getIntegerType(64)); | ||||
|     denominator = | ||||
|         rewriter.create<SIToFPOp>(loc, denominator, numerator.getType()); | ||||
|   } | ||||
| 
 | ||||
|   Value average = rewriter.create<DivFOp>(loc, numerator, denominator); | ||||
| 
 | ||||
|   rewriter.create<StoreOp>(loc, average, alloc, resultIndices); | ||||
| } | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| // Helper function to insert alloc and dealloc ops for memref of dynamic shape.
 | ||||
| //
 | ||||
| Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter, | ||||
|     Location loc, bool insertDealloc, MemRefType memRefType, Value inputOperand, | ||||
|     ArrayRef<int64_t> kernelShape, ArrayRef<int64_t> pads, | ||||
|     ArrayRef<int64_t> strides, ArrayRef<int64_t> dilations, bool ceilMode) { | ||||
|   AllocOp alloc; | ||||
| 
 | ||||
|   // Shape and rank information related to result and kernel.
 | ||||
|   auto resultShape = memRefType.getShape(); | ||||
|   auto resultRank = resultShape.size(); | ||||
|   auto kernelRank = kernelShape.size(); | ||||
|   auto kernelOffset = resultRank - kernelRank; | ||||
| 
 | ||||
|   // Compute dimensions of the result of this operation.
 | ||||
|   SmallVector<Value, 2> allocOperands; | ||||
|   for (int i = 0; i < kernelOffset; ++i) { | ||||
|     if (resultShape[i] < 0) { | ||||
|       auto dim = rewriter.create<DimOp>(loc, inputOperand, i); | ||||
|       allocOperands.emplace_back(dim); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   Value zero, one; | ||||
|   if (ceilMode) { | ||||
|     zero = rewriter.create<ConstantOp>( | ||||
|         loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0)); | ||||
|   } | ||||
|   one = rewriter.create<ConstantOp>( | ||||
|       loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 1)); | ||||
| 
 | ||||
|   for (int i = kernelOffset; i < resultShape.size(); ++i) { | ||||
|     if (resultShape[i] < 0) { | ||||
|       // dim =
 | ||||
|       //   let numerator = (input + pad - (kernel - 1) * dilation - 1)
 | ||||
|       //   in let denominator = stride
 | ||||
|       //      in
 | ||||
|       //        if (ceilMode)
 | ||||
|       //          ceil(numerator / denominator) + 1
 | ||||
|       //        else
 | ||||
|       //          floor(numerator / denominator) + 1
 | ||||
|       int spatialIndex = i - kernelOffset; | ||||
| 
 | ||||
|       // numerator = (input + pad - (kernel - 1) * dilation - 1)
 | ||||
|       int64_t dilation = dilations.empty() ? 1 : dilations[spatialIndex]; | ||||
|       int64_t padKernelDilation = | ||||
|           (pads[spatialIndex] + pads[spatialIndex + kernelRank]) - | ||||
|           (kernelShape[spatialIndex] - 1) * dilation - 1; | ||||
|       auto padKernelDilationVal = emitConstantOp( | ||||
|           rewriter, loc, rewriter.getIntegerType(64), padKernelDilation); | ||||
|       auto inputDim = rewriter.create<DimOp>(loc, inputOperand, i); | ||||
|       auto inputDimVal = rewriter.create<IndexCastOp>( | ||||
|           loc, inputDim, rewriter.getIntegerType(64)); | ||||
|       auto numeratorVal = | ||||
|           rewriter.create<AddIOp>(loc, inputDimVal, padKernelDilationVal); | ||||
|       // denominator
 | ||||
|       auto denominatorVal = emitConstantOp( | ||||
|           rewriter, loc, rewriter.getIntegerType(64), strides[spatialIndex]); | ||||
| 
 | ||||
|       // numerator / denominator
 | ||||
|       Value dimVal = | ||||
|           rewriter.create<SignedDivIOp>(loc, numeratorVal, denominatorVal); | ||||
| 
 | ||||
|       if (ceilMode) { | ||||
|         auto remainder = | ||||
|             rewriter.create<SignedRemIOp>(loc, numeratorVal, denominatorVal); | ||||
|         auto isZero = | ||||
|             rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, remainder, zero); | ||||
|         auto dimPlusOne = rewriter.create<AddIOp>(loc, dimVal, one); | ||||
|         dimVal = rewriter.create<SelectOp>(loc, isZero, dimVal, dimPlusOne); | ||||
|       } | ||||
| 
 | ||||
|       dimVal = rewriter.create<AddIOp>(loc, dimVal, one); | ||||
|       allocOperands.emplace_back( | ||||
|           rewriter.create<IndexCastOp>(loc, dimVal, rewriter.getIndexType())); | ||||
|     } | ||||
|   } | ||||
|   alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands); | ||||
|   if (insertDealloc) { | ||||
|     auto *parentBlock = alloc.getOperation()->getBlock(); | ||||
|     auto dealloc = rewriter.create<DeallocOp>(loc, alloc); | ||||
|     dealloc.getOperation()->moveBefore(&parentBlock->back()); | ||||
|   } | ||||
|   return alloc; | ||||
| } | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===//
 | ||||
| // Template function that does pooling.
 | ||||
| //
 | ||||
| template <typename PoolOp> | ||||
| struct ONNXPoolOpLowering : public ConversionPattern { | ||||
|   ONNXPoolOpLowering(MLIRContext *ctx) | ||||
|       : ConversionPattern(PoolOp::getOperationName(), 1, ctx) {} | ||||
| 
 | ||||
|   LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands, | ||||
|       ConversionPatternRewriter &rewriter) const final { | ||||
|     ONNXMaxPoolSingleOutOpOperandAdaptor operandAdaptor(operands); | ||||
|     auto loc = op->getLoc(); | ||||
| 
 | ||||
|     // Match
 | ||||
|     ONNXMaxPoolSingleOutOp poolOp = llvm::dyn_cast<ONNXMaxPoolSingleOutOp>(op); | ||||
|     PoolOp poolOp = llvm::dyn_cast<PoolOp>(op); | ||||
| 
 | ||||
|     // Read kernel_shape attribute
 | ||||
|     SmallVector<int, 4> kernelShape; | ||||
|     SmallVector<int64_t, 4> kernelShape; | ||||
|     auto kernelShapeAttribute = poolOp.kernel_shapeAttr(); | ||||
|     for (auto dim : kernelShapeAttribute.getValue()) | ||||
|     for (Attribute dim : kernelShapeAttribute.getValue()) | ||||
|       kernelShape.emplace_back(dim.cast<IntegerAttr>().getInt()); | ||||
| 
 | ||||
|     // Read strides attribute
 | ||||
|     SmallVector<int, 4> strides; | ||||
|     SmallVector<int64_t, 4> strides; | ||||
|     auto stridesAttribute = poolOp.stridesAttr(); | ||||
|     for (auto stride : stridesAttribute.getValue()) | ||||
|     for (Attribute stride : stridesAttribute.getValue()) | ||||
|       strides.emplace_back(stride.cast<IntegerAttr>().getInt()); | ||||
| 
 | ||||
|     // Read ceil_mode attribute
 | ||||
|     auto ceilMode = poolOp.ceil_mode().getSExtValue(); | ||||
| 
 | ||||
|     // Read pads attribute
 | ||||
|     SmallVector<int, 4> pads; | ||||
|     SmallVector<int64_t, 4> pads; | ||||
|     auto padsAttribute = poolOp.padsAttr(); | ||||
|     for (auto pad : padsAttribute.getValue()) | ||||
|     for (Attribute pad : padsAttribute.getValue()) | ||||
|       pads.emplace_back(pad.cast<IntegerAttr>().getInt()); | ||||
| 
 | ||||
|     // Read dilations attribute
 | ||||
|     SmallVector<int, 4> dilations; | ||||
|     auto dilationsAttribute = poolOp.dilationsAttr(); | ||||
|     for (auto dilation : dilationsAttribute.getValue()) | ||||
|       dilations.emplace_back(dilation.cast<IntegerAttr>().getInt()); | ||||
|     // Read dilations attribute if the op has.
 | ||||
|     std::vector<int64_t> dilations = getDilations<PoolOp>(poolOp); | ||||
|     bool isDilated = !dilations.empty(); | ||||
| 
 | ||||
|     // Type information about the input and result of this operation.
 | ||||
|     auto inputOperand = operandAdaptor.X(); | ||||
|     auto inputShape = inputOperand.getType().cast<MemRefType>().getShape(); | ||||
|     auto memRefType = convertToMemRefType(*op->result_type_begin()); | ||||
|     auto resultShape = memRefType.getShape(); | ||||
|     auto resultElementType = memRefType.getElementType(); | ||||
|     auto outputShape = memRefType.getShape(); | ||||
|     auto outputElementType = memRefType.getElementType(); | ||||
| 
 | ||||
|     // Batch indices: N and C dimensions
 | ||||
|     int batchRank = 2; | ||||
|     // Kernel offset in the input shape.
 | ||||
|     int kernelOffset = inputShape.size() - kernelShape.size(); | ||||
| 
 | ||||
|     // Insert an allocation and deallocation for the result of this operation.
 | ||||
|     // Insert an allocation and deallocation for the output of this operation.
 | ||||
|     Value alloc; | ||||
|     bool insertDealloc = checkInsertDealloc(op); | ||||
| 
 | ||||
|     if (hasAllConstantDimensions(memRefType)) | ||||
|       alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); | ||||
|     else { | ||||
|       // Compute dimensions of the result of this operation.
 | ||||
|       SmallVector<Value, 2> allocOperands; | ||||
|       for (int i = 0; i < batchRank; ++i) { | ||||
|         if (resultShape[i] < 0) { | ||||
|           auto dim = rewriter.create<DimOp>(loc, inputOperand, i); | ||||
|           allocOperands.emplace_back(dim); | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|       Value zero, one; | ||||
|       if (ceilMode) { | ||||
|         zero = rewriter.create<ConstantOp>( | ||||
|             loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0)); | ||||
|       } | ||||
|       one = rewriter.create<ConstantOp>( | ||||
|           loc, rewriter.getIntegerAttr(rewriter.getIntegerType(64), 1)); | ||||
| 
 | ||||
|       int spatialRank = resultShape.size() - batchRank; | ||||
|       for (int i = batchRank; i < resultShape.size(); ++i) { | ||||
|         if (resultShape[i] < 0) { | ||||
|           // dim =
 | ||||
|           //   let numerator = (input + pad - (kernel - 1) * dilation - 1)
 | ||||
|           //   in let denomitor = stride
 | ||||
|           //      in
 | ||||
|           //        if (ceilMode)
 | ||||
|           //          ceil(numerator / denominator) + 1
 | ||||
|           //        else
 | ||||
|           //          floor(numerator / denominator) + 1
 | ||||
|           int spatialIndex = i - batchRank; | ||||
| 
 | ||||
|           // numerator = (input + pad - (kernel - 1) * dilation - 1)
 | ||||
|           auto inputDim = rewriter.create<DimOp>(loc, inputOperand, i); | ||||
|           auto inputVal = rewriter.create<IndexCastOp>( | ||||
|               loc, inputDim, rewriter.getIntegerType(64)); | ||||
|           int64_t padKernelDilation = | ||||
|               (pads[spatialIndex] + pads[spatialIndex + spatialRank]) - | ||||
|               (kernelShape[spatialIndex] - 1) * dilations[spatialIndex] - 1; | ||||
|           auto padKernelDilationVal = rewriter.create<ConstantOp>( | ||||
|               loc, rewriter.getIntegerAttr( | ||||
|                        rewriter.getIntegerType(64), padKernelDilation)); | ||||
|           auto numeratorVal = | ||||
|               rewriter.create<AddIOp>(loc, inputVal, padKernelDilationVal); | ||||
|           // denominator
 | ||||
|           auto denominatorVal = rewriter.create<ConstantOp>( | ||||
|               loc, rewriter.getIntegerAttr( | ||||
|                        rewriter.getIntegerType(64), strides[spatialIndex])); | ||||
| 
 | ||||
|           // numerator / denominator
 | ||||
|           Value dimVal = | ||||
|               rewriter.create<SignedDivIOp>(loc, numeratorVal, denominatorVal); | ||||
| 
 | ||||
|           if (ceilMode) { | ||||
|             auto remainder = rewriter.create<SignedRemIOp>( | ||||
|                 loc, numeratorVal, denominatorVal); | ||||
|             auto isZero = rewriter.create<CmpIOp>( | ||||
|                 loc, CmpIPredicate::eq, remainder, zero); | ||||
|             auto dimPlusOne = rewriter.create<AddIOp>(loc, dimVal, one); | ||||
|             dimVal = rewriter.create<SelectOp>(loc, isZero, dimVal, dimPlusOne); | ||||
|           } | ||||
| 
 | ||||
|           dimVal = rewriter.create<AddIOp>(loc, dimVal, one); | ||||
|           allocOperands.emplace_back(rewriter.create<IndexCastOp>( | ||||
|               loc, dimVal, rewriter.getIndexType())); | ||||
|         } | ||||
|       } | ||||
|       alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands); | ||||
|       if (insertDealloc) { | ||||
|         auto *parentBlock = alloc.getDefiningOp()->getBlock(); | ||||
|         auto dealloc = rewriter.create<DeallocOp>(loc, alloc); | ||||
|         dealloc.getOperation()->moveBefore(&parentBlock->back()); | ||||
|       } | ||||
|       alloc = insertAllocAndDeallocForPooling(rewriter, loc, insertDealloc, | ||||
|           memRefType, inputOperand, kernelShape, pads, strides, dilations, | ||||
|           ceilMode); | ||||
|     } | ||||
| 
 | ||||
|     // R = MaxPool(D)
 | ||||
|     // input = Pool(output)
 | ||||
|     //
 | ||||
|     // The input/output shapes will look like this:
 | ||||
|     //
 | ||||
|     // D (NxCxHxW) -> R (NxCxRHxRW)
 | ||||
|     // input (NxCxHxW) -> output (NxCxHOxWO)
 | ||||
|     //
 | ||||
|     // The loop nest will look as follows:
 | ||||
|     //
 | ||||
|     // strides = [s1, s2]
 | ||||
|     // kernelShape = [kH, kW]
 | ||||
|     // pads = [ptH, ptW, pbH, pbW]
 | ||||
|     // strides = [sH, sW]
 | ||||
|     // dilations = [dH, dW]
 | ||||
|     // round = ceil if ceilMode else floor
 | ||||
|     //
 | ||||
|     // for n = 0 .. N:
 | ||||
|     //   for c = 0 .. C:
 | ||||
|     //     for r1 = 0 .. RH:
 | ||||
|     //       for r2 = 0 .. RW:
 | ||||
|     //         R[n][c][r1][r2] = negative_infinity;
 | ||||
|     //         for k1 = 0 .. KH:
 | ||||
|     //           for k2 = 0 .. KW:
 | ||||
|     //             t = D[n][c][s1 * r1 + k1 * d1][s2 * r2 + k2 * d2];
 | ||||
|     //             R[n][c][r1][r2] = max(R[n][c][r1][r2], t);
 | ||||
|     // for n in range(N):
 | ||||
|     //   for c in range(C):
 | ||||
|     //     for ho in range(HO):
 | ||||
|     //       for wo in range(WO):
 | ||||
|     //         # Initialize values for the output.
 | ||||
|     //         output[n][c][ho][wo] = getIdentityValue(...);
 | ||||
|     //
 | ||||
|     // Naming:
 | ||||
|     //   n, c, r1, r2: outer loop nest indices
 | ||||
|     //   k1, k2: inner loop nest indices
 | ||||
|     //   s1, s2: strides
 | ||||
|     //   d1, d2: dilations
 | ||||
|     //         # Thanks to Tian (@tjingrant) for the following derivation about
 | ||||
|     //         # firstValid.
 | ||||
|     //         # When dilation is non-unit, the first valid pixel to
 | ||||
|     //         # apply pooling on will not be the 0-th pixel, but rather
 | ||||
|     //         # the smallest integer n to make -pH + n * 3 greater than
 | ||||
|     //         # or equal to 0.
 | ||||
|     //         # We derive what is this smallest n:
 | ||||
|     //         # -pH + n * dH >= 0
 | ||||
|     //         #       n * dH >= pH
 | ||||
|     //         #            n >= pH/dH
 | ||||
|     //         # thus n = ceil(pH/dH)
 | ||||
|     //         # thus the first valid pixel location is
 | ||||
|     //         # ceil(pH / dilation) * dilation - pH
 | ||||
|     //
 | ||||
|     // TODO: handle padding.
 | ||||
|     //         firstValidH = ceil(float(ptH / dH)) * dH - ptH
 | ||||
|     //         startH = max(firstValidH, ho * sH - ptH)
 | ||||
|     //         endH = min(H, ho * sH + (kH -1) * dH  + 1 - ptH)
 | ||||
|     //
 | ||||
|     //         firstValidW= ceil(float(pW / dW)) * dW - ptW
 | ||||
|     //         startW = max(firstValidW, wo * sW - ptW)
 | ||||
|     //         endW = min(W, wo * sW + (kW - 1) * dW + 1 - ptW)
 | ||||
|     //
 | ||||
|     //         hDim= round(float(endH - startH) / float(dH))
 | ||||
|     //         wDim= round(float(endW - startW) / float(dW))
 | ||||
|     //
 | ||||
|     //         # Apply the pooling window.
 | ||||
|     //         # The pooling window can be smaller than the kernel when slicing
 | ||||
|     //         # over the border edges.
 | ||||
|     //         for hi in range(startH, endH, dH):
 | ||||
|     //           for wi in range(startW, endW, dW):
 | ||||
|     //             output[n, c, ho, wo] = emitScalarOpFor(output[n, c, ho, wo],
 | ||||
|     //                                                    input[n, c, hi, wi]);
 | ||||
|     //
 | ||||
|     //         # The above two for-loops are rewritten as follows:
 | ||||
|     //         # (since KrnlIterateOp has not supported `step` yet)
 | ||||
|     //         for hp in range(hDim):
 | ||||
|     //           for wp in range(wDim):
 | ||||
|     //             hi = hp * dH + startH
 | ||||
|     //             wi = wp * dW + startW
 | ||||
|     //             output[n, c, ho, wo] = emitScalarOpFor(output[n, c, ho, wo],
 | ||||
|     //                                                    input[n, c, hi, wi]);
 | ||||
|     //
 | ||||
|     //         # Do post processing such as taking average pooling:
 | ||||
|     //         postProcessPoolingWindow(...)
 | ||||
|     //
 | ||||
|     // Helper functions:
 | ||||
|     //   getIdentityValue(): to return the indentity value
 | ||||
|     //     - negative infinity for MaxPool
 | ||||
|     //     - 0 for AveragePool
 | ||||
|     //   emitScalarOpFor(): to do primitive computation for Pooling, e.g.
 | ||||
|     //     - compute max for MaxPool
 | ||||
|     //     - compute sum for AveragePool
 | ||||
|     //   postProcessPoolingWindow(): to do post processing over the whole
 | ||||
|     //   pooling window, e.g.
 | ||||
|     //     - do nothing in case of MaxPool
 | ||||
|     //     - calculate the average in case of AveragePool, e.g.
 | ||||
|     //         if hDim * wDim> 0:
 | ||||
|     //           output[n, c, ho, wo] = output[n, c, ho, wo] / (hDim*wDim)
 | ||||
|     //
 | ||||
| 
 | ||||
|     // 1. Define outer loops and emit empty optimization block.
 | ||||
|     auto nOuterLoops = resultShape.size(); | ||||
|     BuildKrnlLoop outerLoops(rewriter, loc, nOuterLoops); | ||||
|     outerLoops.createDefineOptimizeAndIterateOp(alloc); | ||||
|     // Identity value of the operation.
 | ||||
|     auto identity = getIdentityValue<PoolOp>(rewriter, loc, outputElementType); | ||||
| 
 | ||||
|     rewriter.setInsertionPointToStart(outerLoops.getIterateBlock()); | ||||
|     // 1. Define output loops to compute one output pixel.
 | ||||
|     // for n in range(N):
 | ||||
|     //   for c in range(C):
 | ||||
|     //     for ho in range(HO):
 | ||||
|     //       for wo in range(WO):
 | ||||
|     BuildKrnlLoop outputLoops(rewriter, loc, outputShape.size()); | ||||
|     outputLoops.createDefineOptimizeAndIterateOp(alloc); | ||||
| 
 | ||||
|     auto ipMainRegion = rewriter.saveInsertionPoint(); | ||||
|     rewriter.setInsertionPointToStart(outputLoops.getIterateBlock()); | ||||
|     { | ||||
|       // 2. Emit the body of the outer loop nest.
 | ||||
|       SmallVector<Value, 4> resultIndices; | ||||
|       for (int i = 0; i < nOuterLoops; ++i) | ||||
|         resultIndices.emplace_back(outerLoops.getInductionVar(i)); | ||||
|       // 2. Emit the body of the output loop nest, which applies a pooling
 | ||||
|       // window to a region in the input, producing one output pixel.
 | ||||
|       SmallVector<Value, 4> outputIndices; | ||||
|       for (int i = 0; i < outputShape.size(); ++i) | ||||
|         outputIndices.emplace_back(outputLoops.getInductionVar(i)); | ||||
| 
 | ||||
|       // 2.1 Emit: R[n][c][r1][r2] = negative_infinity;
 | ||||
|       Value identity = getIdentityValue<ONNXMaxPoolSingleOutOp>( | ||||
|           rewriter, loc, resultElementType); | ||||
|       rewriter.create<StoreOp>(loc, identity, alloc, resultIndices); | ||||
|       // 2.1 Emit: output[n][c][ho][wo] = identity
 | ||||
|       rewriter.create<StoreOp>(loc, identity, alloc, outputIndices); | ||||
| 
 | ||||
|       // 2.2 Define inner loops.
 | ||||
|       int nInnerLoops = kernelShape.size(); | ||||
|       BuildKrnlLoop innerLoops(rewriter, loc, nInnerLoops); | ||||
|       innerLoops.createDefineAndOptimizeOp(); | ||||
|       //   for Kx = 0 .. KX
 | ||||
|       for (int i = 0; i < nInnerLoops; ++i) | ||||
|         innerLoops.pushBounds(0, kernelShape[i]); | ||||
|       // 2.2 Emit affine maps which express the lower and upper bounds for the
 | ||||
|       // pooling window's dimensions.
 | ||||
|       // The pooling window can be smaller than the kernel when slicing it over
 | ||||
|       // the border edges. Thus, we will compute the start and end indices for
 | ||||
|       // each dimension as follows.
 | ||||
|       //   firstValidH = ceil(float(ptH / dH)) * dH - ptH
 | ||||
|       //   startH = max(firstValidH, ho * sH - ptH)
 | ||||
|       //   endH = min(H, ho * sH + (kH - 1) * dH  + 1 - pbH)
 | ||||
|       //   hDim = round(float(endH - startH) / float(dH))
 | ||||
| 
 | ||||
|       // 2.3 Emit inner loop nest.
 | ||||
|       innerLoops.createIterateOp(); | ||||
|       rewriter.setInsertionPointToStart(innerLoops.getIterateBlock()); | ||||
|       { | ||||
|         // 3. Emit inner loop body
 | ||||
|         // t = D[n][c][s1 * r1 + k1 * d1][s2 * r2 + k2 * d2];
 | ||||
|         // R[n][c][r1][r2] = max(R[n][c][r1][r2], t);
 | ||||
| 
 | ||||
|         // 3.1 Prepare indices for accesing the data tensor.
 | ||||
|         SmallVector<Value, 4> dataIndices; | ||||
|         // 3.1.1 Batch indices: n, c
 | ||||
|         for (int i = 0; i < batchRank; ++i) | ||||
|           dataIndices.emplace_back(outerLoops.getInductionVar(i)); | ||||
|         // 3.1.2 Insert spatial indices: sX * rX + kX * dX
 | ||||
|         for (int i = batchRank; i < nOuterLoops; ++i) { | ||||
|           // Get index along the inner loop's induction variables.
 | ||||
|           // It is used to obtain kernel/pad/stride/dilation index.
 | ||||
|           int j = i - batchRank; | ||||
| 
 | ||||
|           Value spatialIndex = outerLoops.getInductionVar(i); | ||||
|           // If strides are present (not default) then emit the correct access
 | ||||
|           // index.
 | ||||
|           // sX *= rX
 | ||||
|           if (strides[i - batchRank] > 1) { | ||||
|             auto strideIndex = emitConstantOp( | ||||
|                 rewriter, loc, rewriter.getIndexType(), strides[j]); | ||||
|             spatialIndex = rewriter.create<MulIOp>( | ||||
|                 loc, strideIndex, outerLoops.getInductionVar(i)); | ||||
|           } | ||||
| 
 | ||||
|           // Dilate the kernel index only if the dilation value is not one (not
 | ||||
|           // default). Otherwise, just add kernelIndex.
 | ||||
|           auto kernelIndex = innerLoops.getInductionVar(j); | ||||
|           if (dilations[j] > 1) { | ||||
|             // sX += dX * kW
 | ||||
|             auto dilationIndex = emitConstantOp( | ||||
|                 rewriter, loc, rewriter.getIndexType(), dilations[j]); | ||||
|             auto dilationKernelIndex = | ||||
|                 rewriter.create<MulIOp>(loc, dilationIndex, kernelIndex); | ||||
|             spatialIndex = | ||||
|                 rewriter.create<AddIOp>(loc, spatialIndex, dilationKernelIndex); | ||||
|       // Prepare induction variables and constants as arguments for the affine
 | ||||
|       // maps.
 | ||||
|       SmallVector<SmallVector<Value, 4>, 4> IVsAndConstants; | ||||
|       { // Construct IVsAndConstants.
 | ||||
|         for (int i = 0; i < kernelShape.size(); ++i) { | ||||
|           SmallVector<Value, 4> ic; | ||||
|           // d0, output
 | ||||
|           ic.emplace_back(outputLoops.getInductionVar(i + kernelOffset)); | ||||
|           // s0, input dim
 | ||||
|           if (inputShape[i + kernelOffset] < 0) { | ||||
|             ic.emplace_back( | ||||
|                 rewriter.create<DimOp>(loc, inputOperand, i + kernelOffset)); | ||||
|           } else { | ||||
|             // sX += kX
 | ||||
|             spatialIndex = | ||||
|                 rewriter.create<AddIOp>(loc, spatialIndex, kernelIndex); | ||||
|             ic.emplace_back(emitConstantOp(rewriter, loc, | ||||
|                 rewriter.getIndexType(), inputShape[i + kernelOffset])); | ||||
|           } | ||||
|           // s1, kernel dim
 | ||||
|           ic.emplace_back(emitConstantOp( | ||||
|               rewriter, loc, rewriter.getIndexType(), kernelShape[i])); | ||||
|           // s2, pad dim
 | ||||
|           ic.emplace_back( | ||||
|               emitConstantOp(rewriter, loc, rewriter.getIndexType(), pads[i])); | ||||
|           // s3, stride dim
 | ||||
|           ic.emplace_back(emitConstantOp( | ||||
|               rewriter, loc, rewriter.getIndexType(), strides[i])); | ||||
|           // s4, dilation dim
 | ||||
|           ic.emplace_back(emitConstantOp(rewriter, loc, rewriter.getIndexType(), | ||||
|               (isDilated) ? dilations[i] : 1)); | ||||
|           IVsAndConstants.emplace_back(ic); | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|           // If ceil mode or dilation is enabled, then the calculated access
 | ||||
|           // index may exceed its dimension. In such a case, we will use the
 | ||||
|           // maximum index, which causes multiple visits to the element of the
 | ||||
|           // maximum index.
 | ||||
|           // TODO: Avoid multiple visits.
 | ||||
|           // Example of out-of-bound.
 | ||||
|           // - Given a 5x5 input X
 | ||||
|           //  X = [[0, 0, 0, 0, 0],
 | ||||
|           //       [1, 1, 1, 1, 1],
 | ||||
|           //       [2, 2, 2, 2, 2],
 | ||||
|           //       [3, 3, 3, 3, 3],
 | ||||
|           //       [4, 4, 4, 4, 4]]
 | ||||
|           // - Do MaxPool with strides=[2, 2], kernel=[2, 2], ceilMode=true,
 | ||||
|           // output is a 3x3 array:
 | ||||
|           // Y = [[1, 1, 1],
 | ||||
|           //      [3, 3, 3],
 | ||||
|           //      [4, 4, 4]]
 | ||||
|           // - When computing Y[2, 0]:
 | ||||
|           //    - In case of kernelIndex = 1, stride = 2
 | ||||
|           //      - No dilation: spatialIndex = 2 * 2 + 1 = 5
 | ||||
|           //        => out of bound
 | ||||
|           //      - dilation = 2: spatialIndex = 2 * 2 + 2 * 1 = 6
 | ||||
|           //        => out of bound
 | ||||
|           if (dilations[j] > 1 or ceilMode) { | ||||
|             Value upperIndex; | ||||
|             if (inputShape[i] < 0) { | ||||
|               Value inputDim = rewriter.create<DimOp>(loc, inputOperand, i); | ||||
|               Value one = rewriter.create<ConstantIndexOp>(loc, 1); | ||||
|               upperIndex = rewriter.create<SubIOp>(loc, inputDim, one); | ||||
|             } else { | ||||
|               upperIndex = | ||||
|                   rewriter.create<ConstantIndexOp>(loc, inputShape[i] - 1); | ||||
|       // Affine maps for the pooling window.
 | ||||
|       AffineMap poolStartMap, poolEndMap, poolDimMap; | ||||
|       { // Construct poolStartMap, poolEndMap and poolDimMap.
 | ||||
|         // AffineExpr(s) to obtain the dimensions and symbols.
 | ||||
|         AffineExpr outputIndex = rewriter.getAffineDimExpr(0); | ||||
|         AffineExpr inputDim = rewriter.getAffineSymbolExpr(0); | ||||
|         AffineExpr kernelDim = rewriter.getAffineSymbolExpr(1); | ||||
|         AffineExpr padTopDim = rewriter.getAffineSymbolExpr(2); | ||||
|         AffineExpr strideDim = rewriter.getAffineSymbolExpr(3); | ||||
|         AffineExpr dilationDim = rewriter.getAffineSymbolExpr(4); | ||||
|         AffineExpr start1 = | ||||
|             (padTopDim).ceilDiv(dilationDim) * dilationDim - padTopDim; | ||||
|         AffineExpr start2 = outputIndex * strideDim - padTopDim; | ||||
|         AffineExpr end1 = inputDim; | ||||
|         AffineExpr end2 = outputIndex * strideDim + | ||||
|                           (kernelDim - 1) * dilationDim + 1 - padTopDim; | ||||
| 
 | ||||
|         // poolDimMap
 | ||||
|         SmallVector<AffineExpr, 4> dimExpr; | ||||
|         // Upperbound for an affine.for is `min AffineMap`, where `min` is
 | ||||
|         // automatically inserted when an affine.for is constructed from
 | ||||
|         // an AffineMap, thus we rewrite `endH - startH` as follows:
 | ||||
|         //   endH - start H
 | ||||
|         //     = min(end1, end2) - max(start1, start2)
 | ||||
|         //     = min(end1 - start1, end1 - start2, end2 - start1, end2 - start2)
 | ||||
|         AffineExpr dimExpr1 = end1 - start1; | ||||
|         AffineExpr dimExpr2 = end1 - start2; | ||||
|         AffineExpr dimExpr3 = end2 - start1; | ||||
|         AffineExpr dimExpr4 = end2 - start2; | ||||
|         for (AffineExpr de : {dimExpr1, dimExpr2, dimExpr3, dimExpr4}) { | ||||
|           if (isDilated) { | ||||
|             de = de + 1; | ||||
|             de = | ||||
|                 (ceilMode) ? de.ceilDiv(dilationDim) : de.floorDiv(dilationDim); | ||||
|           } | ||||
|           dimExpr.emplace_back(de); | ||||
|         } | ||||
|         poolDimMap = AffineMap::get(1, 5, dimExpr); | ||||
| 
 | ||||
|         // poolStartMap and poolEndMap
 | ||||
|         poolStartMap = AffineMap::get(1, 5, {start1, start2}); | ||||
|         poolEndMap = AffineMap::get(1, 5, {end1, end2}); | ||||
|       } | ||||
| 
 | ||||
|       // Obtain values from the affine maps.
 | ||||
|       SmallVector<Value, 4> poolStartValues; | ||||
|       SmallVector<Value, 4> poolDimValues; | ||||
|       { // Construct poolStartValues and poolDimValues.
 | ||||
|         for (int i = 0; i < kernelShape.size(); ++i) { | ||||
|           Value startIndex = rewriter.create<AffineMaxOp>( | ||||
|               loc, poolStartMap, ValueRange(IVsAndConstants[i])); | ||||
|           poolStartValues.emplace_back(startIndex); | ||||
| 
 | ||||
|           Value endIndex = rewriter.create<AffineMinOp>( | ||||
|               loc, poolEndMap, ValueRange(IVsAndConstants[i])); | ||||
| 
 | ||||
|           Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex); | ||||
|           if (isDilated) { | ||||
|             Value one = | ||||
|                 emitConstantOp(rewriter, loc, rewriter.getIndexType(), 1); | ||||
|             Value numerator = rewriter.create<AddIOp>(loc, dim, one); | ||||
|             Value denominator = IVsAndConstants[i][5]; // dilations[i]
 | ||||
|             dim = rewriter.create<SignedDivIOp>(loc, numerator, denominator); | ||||
|             if (ceilMode) { | ||||
|               auto remainder = | ||||
|                   rewriter.create<SignedRemIOp>(loc, numerator, denominator); | ||||
|               Value zero = | ||||
|                   emitConstantOp(rewriter, loc, rewriter.getIndexType(), 0); | ||||
|               auto isZero = rewriter.create<CmpIOp>( | ||||
|                   loc, CmpIPredicate::eq, remainder, zero); | ||||
|               auto dimPlusOne = rewriter.create<AddIOp>(loc, dim, one); | ||||
|               dim = rewriter.create<SelectOp>(loc, isZero, dim, dimPlusOne); | ||||
|             } | ||||
|             auto greaterCondition = rewriter.create<CmpIOp>( | ||||
|                 loc, CmpIPredicate::sgt, spatialIndex, upperIndex); | ||||
|             spatialIndex = rewriter.create<SelectOp>( | ||||
|                 loc, greaterCondition, upperIndex, spatialIndex); | ||||
|           } | ||||
|           poolDimValues.emplace_back(dim); | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|           dataIndices.emplace_back(spatialIndex); | ||||
|       // 2.3 Define pooling loops.
 | ||||
|       //  for hp in range(hDim):
 | ||||
|       //    for wp in range(wDim):
 | ||||
|       //      hi = hp * dH + startH
 | ||||
|       //      wi = wp * dW + startW
 | ||||
|       //      output[n][c][ho][wo] =
 | ||||
|       //        emitScalarOpFor(output[n][c][ho][wo], input[n, c, hi, wi]);
 | ||||
|       BuildKrnlLoop poolingLoops(rewriter, loc, kernelShape.size()); | ||||
|       poolingLoops.createDefineAndOptimizeOp(); | ||||
|       for (int i = 0; i < kernelShape.size(); ++i) | ||||
|         poolingLoops.pushBounds( | ||||
|             0, poolDimMap, llvm::makeArrayRef(IVsAndConstants[i])); | ||||
|       poolingLoops.createIterateOp(); | ||||
| 
 | ||||
|       auto ipOuterLoops = rewriter.saveInsertionPoint(); | ||||
|       rewriter.setInsertionPointToStart(poolingLoops.getIterateBlock()); | ||||
|       { | ||||
|         // 2.4 Emit the body of the pooling loop nest.
 | ||||
|         // Prepare indices to access a pixel in the input.
 | ||||
|         std::vector<Value> inputIndices; | ||||
|         { // Construct inputIndices
 | ||||
|           for (int i = 0; i < kernelOffset; ++i) | ||||
|             inputIndices.emplace_back(outputIndices[i]); | ||||
|           for (int i = kernelOffset; i < inputShape.size(); ++i) { | ||||
|             int j = i - kernelOffset; | ||||
|             if (isDilated) { | ||||
|               // hi = hp * dH + startH
 | ||||
|               Value index = rewriter.create<MulIOp>( | ||||
|                   loc, poolingLoops.getInductionVar(j), IVsAndConstants[j][5]); | ||||
|               index = rewriter.create<AddIOp>(loc, index, poolStartValues[j]); | ||||
|               inputIndices.emplace_back(index); | ||||
|             } else { | ||||
|               // hi = hp + startH
 | ||||
|               inputIndices.emplace_back(rewriter.create<AddIOp>( | ||||
|                   loc, poolingLoops.getInductionVar(j), poolStartValues[j])); | ||||
|             } | ||||
|           } | ||||
|         } | ||||
| 
 | ||||
|         // 3.2 Do pooling.
 | ||||
|         auto loadData = rewriter.create<LoadOp>(loc, inputOperand, dataIndices); | ||||
|         auto loadPartialResult = | ||||
|             rewriter.create<LoadOp>(loc, alloc, resultIndices); | ||||
|         Value result = emitScalarOpFor<ONNXMaxPoolSingleOutOp>(rewriter, loc, | ||||
|             op, resultElementType, {loadPartialResult, loadData}); | ||||
|         rewriter.create<StoreOp>(loc, result, alloc, resultIndices); | ||||
|         // Apply pooling operation.
 | ||||
|         //      output[n][c][ho][wo] =
 | ||||
|         //        emitScalarOpFor(output[n][c][ho][wo], input[n, c, hi, wi]);
 | ||||
|         Value loadInput = | ||||
|             rewriter.create<LoadOp>(loc, inputOperand, inputIndices); | ||||
|         Value loadPartialOutput = | ||||
|             rewriter.create<LoadOp>(loc, alloc, outputIndices); | ||||
|         Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op, | ||||
|             outputElementType, {loadPartialOutput, loadInput}); | ||||
|         rewriter.create<StoreOp>(loc, output, alloc, outputIndices); | ||||
|       } | ||||
| 
 | ||||
|       // 2.5 Post-processing for the pooling window, e.g. taking average.
 | ||||
|       rewriter.restoreInsertionPoint(ipOuterLoops); | ||||
|       postProcessPoolingWindow<PoolOp>(rewriter, loc, poolOp, alloc, | ||||
|           outputIndices, kernelShape, poolDimValues); | ||||
|     } | ||||
| 
 | ||||
|     // Go back to the main region.
 | ||||
|     rewriter.restoreInsertionPoint(ipMainRegion); | ||||
| 
 | ||||
|     rewriter.replaceOp(op, alloc); | ||||
| 
 | ||||
|     return success(); | ||||
|  | @ -320,5 +565,6 @@ struct ONNXMaxPoolSingleOutOpLowering : public ConversionPattern { | |||
| 
 | ||||
| void populateLoweringONNXPoolingOpPattern( | ||||
|     OwningRewritePatternList &patterns, MLIRContext *ctx) { | ||||
|   patterns.insert<ONNXMaxPoolSingleOutOpLowering>(ctx); | ||||
|   patterns.insert<ONNXPoolOpLowering<ONNXMaxPoolSingleOutOp>>(ctx); | ||||
|   patterns.insert<ONNXPoolOpLowering<ONNXAveragePoolOp>>(ctx); | ||||
| } | ||||
|  |  | |||
|  | @ -149,6 +149,15 @@ void KrnlIterateOperandPack::pushOperandBound(Value operand) { | |||
|   _operands.emplace_back(operand); | ||||
| } | ||||
| 
 | ||||
| void KrnlIterateOperandPack::pushAffineMapBound( | ||||
|     AffineMap map, ArrayRef<Value> operands) { | ||||
|   if (boundMaps.size() % 2 == 0) | ||||
|     _operands.emplace_back(inputLoops[boundMaps.size() / 2]); | ||||
|   boundMaps.emplace_back(AffineMapAttr::get(map)); | ||||
|   for (auto operand : operands) | ||||
|     _operands.emplace_back(operand); | ||||
| } | ||||
| 
 | ||||
| BuildKrnlLoop::BuildKrnlLoop( | ||||
|     ConversionPatternRewriter &rewriter, Location loc, int loopNum) | ||||
|     : rewriter(rewriter), loc(loc), originalLoopNum(loopNum), pack(NULL), | ||||
|  | @ -209,6 +218,13 @@ int BuildKrnlLoop::pushBounds(int64_t lowerBound, Value upperBound) { | |||
|   return pushCount++; | ||||
| } | ||||
| 
 | ||||
| int BuildKrnlLoop::pushBounds(int64_t lowerBound, AffineMap upperBound, | ||||
|     ArrayRef<Value> operandsForUpperBoundMap) { | ||||
|   pack->pushConstantBound(lowerBound); | ||||
|   pack->pushAffineMapBound(upperBound, operandsForUpperBoundMap); | ||||
|   return pushCount++; | ||||
| } | ||||
| 
 | ||||
| int BuildKrnlLoop::pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand, | ||||
|     int upperBoundMemRefIndex, bool upperBoundMustBeConstant) { | ||||
|   pack->pushConstantBound(lowerBound); | ||||
|  |  | |||
|  | @ -87,6 +87,8 @@ struct KrnlIterateOperandPack { | |||
| 
 | ||||
|   void pushOperandBound(mlir::Value operand); | ||||
| 
 | ||||
|   void pushAffineMapBound(mlir::AffineMap map, ArrayRef<Value> operands); | ||||
| 
 | ||||
|   llvm::SmallVector<mlir::Value, 8> getOperands() const { return _operands; } | ||||
| 
 | ||||
|   mlir::ArrayAttr getAttributes() const { | ||||
|  | @ -159,6 +161,8 @@ public: | |||
|   // must be of MemRef type.
 | ||||
|   int pushBounds(int64_t lowerBound, int64_t upperBound); | ||||
|   int pushBounds(int64_t lowerBound, Value upperBound); | ||||
|   int pushBounds(int64_t lowerBound, AffineMap upperBound, | ||||
|       ArrayRef<Value> operandsForUpperBoundMap); | ||||
|   int pushBounds(Value lowerBound, Value upperBound); | ||||
|   int pushBounds(int64_t lowerBound, Value upperBoundMemRefOperand, | ||||
|       int upperBoundMemRefIndex, bool upperBoundMustBeConstant = false); | ||||
|  |  | |||
|  | @ -98,7 +98,6 @@ def ONNXEntryPointOp: ONNX_Op<"EntryPoint"> { | |||
| 
 | ||||
| def ONNXMaxPoolSingleOutOp: ONNX_Op<"MaxPoolSingleOut", | ||||
|     [NoSideEffect, DeclareOpInterfaceMethods<ShapeInferenceOpInterface>]> { | ||||
|   let hasCanonicalizer = 1; | ||||
|   let summary = "ONNX MaxPool operation with a single output."; | ||||
|   let description = [{ | ||||
|     "ONNX MaxPool operation with a single output." | ||||
|  |  | |||
|  | @ -33,7 +33,7 @@ bool hasNonZeroInArrayAttr(ArrayAttr attrs) { | |||
| } | ||||
| 
 | ||||
| // Create an ArrayAttr of IntergerAttr(s) of zero values.
 | ||||
| // This function is used for padding attribute in MaxPoolSingleOut.
 | ||||
| // This function is used for padding attribute in Conv.
 | ||||
| ArrayAttr createArrayAttrOfZeros( | ||||
|     PatternRewriter &rewriter, ArrayAttr origAttrs) { | ||||
|   int nElements = origAttrs.getValue().size(); | ||||
|  | @ -51,7 +51,7 @@ ArrayAttr createArrayAttrOfZeros( | |||
| //         |_____|                  |_____|
 | ||||
| //                 nZeros                    nZeros
 | ||||
| //
 | ||||
| // This function is used for padding attribute in MaxPoolSingleOut.
 | ||||
| // This function is used for padding attribute in Conv.
 | ||||
| ArrayAttr insertZerosForNonPaddedDims( | ||||
|     PatternRewriter &rewriter, ArrayAttr origAttrs, int extensionLength) { | ||||
|   int nDims = (int)origAttrs.getValue().size() / 2; | ||||
|  | @ -72,11 +72,6 @@ ArrayAttr insertZerosForNonPaddedDims( | |||
| 
 | ||||
| } // end anonymous namespace
 | ||||
| 
 | ||||
| /// on the ONNXMaxPoolSingleOutOp.
 | ||||
| void ONNXMaxPoolSingleOutOp::getCanonicalizationPatterns( | ||||
|     OwningRewritePatternList &results, MLIRContext *context) { | ||||
|   results.insert<MaxPoolSingleOutOpPaddingPattern>(context); | ||||
| } | ||||
| /// on the ONNXConvOp.
 | ||||
| void ONNXConvOp::getCanonicalizationPatterns( | ||||
|     OwningRewritePatternList &results, MLIRContext *context) { | ||||
|  |  | |||
|  | @ -33,13 +33,8 @@ class StringAttrOfValue<string val>: | |||
| class FloatAttrOfValue<int val>: | ||||
|   NativeCodeCall<"FloatAttr::get($0.getType().cast<TensorType>().getElementType(), " # val # ")">; | ||||
| 
 | ||||
| // Create a FloatAttr for the negative infinity. | ||||
| def FloatAttrOfNegativeInfinity: | ||||
|   NativeCodeCall<"FloatAttr::get($0.getType().cast<TensorType>().getElementType(), " | ||||
|                                 "-std::numeric_limits<double>::infinity())">; | ||||
| 
 | ||||
| // Create an ArrayAttr of IntergerAttr(s) of zero values. | ||||
| // This function is used for padding attribute in MaxPoolSingleOut. | ||||
| // This function is used for padding attribute in Conv. | ||||
| def createArrayAttrOfZerosFrom: | ||||
|   NativeCodeCall<"createArrayAttrOfZeros($_builder, $0)">; | ||||
| 
 | ||||
|  | @ -53,7 +48,7 @@ def createArrayAttrOfZerosFrom: | |||
| //         |_____|                  |_____| | ||||
| //                 nZeros                    nZeros | ||||
| // | ||||
| // This function is used for padding attribute in MaxPoolSingleOut. | ||||
| // This function is used for padding attribute in Conv. | ||||
| class insertZerosForNonPaddedDims<int extensionLength>: | ||||
|   NativeCodeCall<"insertZerosForNonPaddedDims($_builder, $0," | ||||
|                                               # extensionLength # ")">; | ||||
|  | @ -66,37 +61,6 @@ def HasNonZeroInArrayAttr: Constraint<CPred<"hasNonZeroInArrayAttr($_self)">, | |||
| class IsNotStringAttrOfValue<string val>: | ||||
|   Constraint<CPred<"$0.cast<StringAttr>().getValue() != \"" # val # "\"">>; | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===// | ||||
| // Rewrite: | ||||
| // %0 = onnx.MaxPoolSingleOutOp(%D : tensor<DShape>) | ||||
| //     {pads = [b0, b1, ... bK, e0, e1, ..., eK]} -> | ||||
| //         tensor<OutShape> | ||||
| // | ||||
| // as: | ||||
| // %0 = onnx.PadConstantValuePadOp(%D) | ||||
| //     {pads = [0, 0, b0, b1, ... bK, 0, 0, e0, e1, ..., eK]} -> | ||||
| //     tensor<DPaddedShape> | ||||
| // %1 = onnx.MaxPoolSingleOut(%0 : tensor<DPaddedShape>) {pads = [0, ..., 0]} -> | ||||
| //     tensor<OutShape> | ||||
| //===----------------------------------------------------------------------===// | ||||
| 
 | ||||
| def MaxPoolSingleOutOpPaddingPattern: Pat< | ||||
|   (ONNXMaxPoolSingleOutOp:$res | ||||
|      $x, | ||||
|      $auto_pad, $ceil_mode, $dilation, $kernel_shape, | ||||
|      $pads, | ||||
|      $storage_order, $strides), | ||||
|   (ONNXMaxPoolSingleOutOp | ||||
|      (ONNXPadConstantValuePadOp $x, | ||||
|         (insertZerosForNonPaddedDims<2> $pads), | ||||
|         (FloatAttrOfNegativeInfinity $res), | ||||
|         (StringAttrOfValue<"constant">)), | ||||
|      $auto_pad, $ceil_mode, $dilation, $kernel_shape, | ||||
|      (createArrayAttrOfZerosFrom $pads), | ||||
|      $storage_order, $strides), | ||||
|   [(HasNonZeroInArrayAttr:$pads), (IsNotStringAttrOfValue<"VALID"> $auto_pad)] | ||||
| >; | ||||
| 
 | ||||
| //===----------------------------------------------------------------------===// | ||||
| // Rewrite: | ||||
| // %0 = onnx.ConvOp(%D : tensor<DShape>, %K) | ||||
|  |  | |||
|  | @ -327,7 +327,7 @@ test_to_enable = [ | |||
|     "test_batchnorm_epsilon_cpu", | ||||
|     "test_batchnorm_example_cpu", | ||||
| 
 | ||||
|     # Pooling | ||||
|     # MaxPoolSingleOut | ||||
|     "test_maxpool_1d_default_cpu", | ||||
|     "test_maxpool_2d_ceil_cpu", | ||||
|     "test_maxpool_2d_default_cpu", | ||||
|  | @ -341,6 +341,21 @@ test_to_enable = [ | |||
|     "test_maxpool_2d_strides_cpu", | ||||
|     "test_maxpool_3d_default_cpu", | ||||
| 
 | ||||
|     # AveragePool | ||||
|     "test_averagepool_1d_default_cpu", | ||||
|     "test_averagepool_2d_ceil_cpu", | ||||
|     "test_averagepool_2d_default_cpu", | ||||
|     "test_averagepool_2d_pads_count_include_pad_cpu", | ||||
|     "test_averagepool_2d_pads_cpu", | ||||
|     "test_averagepool_2d_precomputed_pads_count_include_pad_cpu", | ||||
|     "test_averagepool_2d_precomputed_pads_cpu", | ||||
|     "test_averagepool_2d_precomputed_same_upper_cpu", | ||||
|     "test_averagepool_2d_precomputed_strides_cpu", | ||||
|     "test_averagepool_2d_same_lower_cpu", | ||||
|     "test_averagepool_2d_same_upper_cpu", | ||||
|     "test_averagepool_2d_strides_cpu", | ||||
|     "test_averagepool_3d_default_cpu", | ||||
| 
 | ||||
| ] | ||||
| 
 | ||||
| # Extract name of all test cases. | ||||
|  |  | |||
|  | @ -94,27 +94,3 @@ func @test_gemm_add_fusion_rank3(%arg0: tensor<128x128x256xf32>, %arg1: tensor<1 | |||
|   // return [[GEMM]] : tensor<*xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| //CHECK-LABEL: @test_maxpoolsingleout_split(%{{.*}}: tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> { | ||||
| func @test_maxpoolsingleout_split(%arg0: tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", ceil_mode = 0, kernel_shape = [5,3], pads = [1, 2, 3, 4] } : (tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> | ||||
|   "std.return"(%0) : (tensor<5x5x36x38xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-NEXT: %0 = "onnx.PadConstantValuePad"(%arg0) {constant_value = 0xFF800000 : f32, mode = "constant", pads = [0, 0, 1, 2, 0, 0, 3, 4]} : (tensor<5x5x32x32xf32>) -> tensor<5x5x36x38xf32> | ||||
|   // CHECK-NEXT: %1 = "onnx.MaxPoolSingleOut"(%0) {auto_pad = "NOTSET", ceil_mode = 0 : i64, kernel_shape = [5, 3], pads = [0, 0, 0, 0], storage_order = 0 : i64} : (tensor<5x5x36x38xf32>) -> tensor<5x5x36x38xf32> | ||||
|   // CHECK-NEXT: return %1 : tensor<5x5x36x38xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| //CHECK-LABEL: @test_maxpoolsingleout_split_unknown_dims(%{{.*}}: tensor<*xf32>) -> tensor<*xf32> { | ||||
| func @test_maxpoolsingleout_split_unknown_dims(%arg0: tensor<*xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", ceil_mode = 0, kernel_shape = [5,3], pads = [1, 2, 3, 4] } : (tensor<*xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-NEXT: %0 = "onnx.PadConstantValuePad"(%arg0) {constant_value = 0xFF800000 : f32, mode = "constant", pads = [0, 0, 1, 2, 0, 0, 3, 4]} : (tensor<*xf32>) -> tensor<*xf32> | ||||
|   // CHECK-NEXT: %1 = "onnx.MaxPoolSingleOut"(%0) {auto_pad = "NOTSET", ceil_mode = 0 : i64, kernel_shape = [5, 3], pads = [0, 0, 0, 0], storage_order = 0 : i64} : (tensor<*xf32>) -> tensor<*xf32> | ||||
|   // CHECK-NEXT: return %1 : tensor<*xf32> | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -1505,231 +1505,6 @@ func @test_batchnorm_testmode_1d(%arg0: tensor<10xf32>, %arg1: tensor<1xf32>, %a | |||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpooling_singleout_no_pad(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: test_maxpooling_singleout_no_pad | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
|   // CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 31, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 31) { | ||||
|   // CHECK:   [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK:   store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 2, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 2) { | ||||
|   // CHECK:     [[H:%.+]] = addi %arg3, %arg5 : index | ||||
|   // CHECK:     [[W:%.+]] = addi %arg4, %arg6 : index | ||||
|   // CHECK:     [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:     [[COMPARE:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[COMPARE]], [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   } | ||||
|   // CHECK: } | ||||
|   // CHECK: return [[RES]] : memref<1x3x31x31xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpooling_singleout_no_pad_w_strides(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2], strides = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x16x16xf32> | ||||
|   // CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 16, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) { | ||||
|   // CHECK:   [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK:   store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:   [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 2, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 2) { | ||||
|   // CHECK:     [[STRIDE_0:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index | ||||
|   // CHECK:     [[H:%.+]] = addi [[MUL_0]], %arg5 : index | ||||
|   // CHECK:     [[STRIDE_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index | ||||
|   // CHECK:     [[W:%.+]] = addi [[MUL_1]], %arg6 : index | ||||
|   // CHECK:     [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:     [[COMPARE:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[COMPARE]], [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:   } | ||||
|   // CHECK: } | ||||
|   // CHECK: return [[RES]] : memref<1x3x16x16xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], ceil_mode = 1} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x16x16xf32> | ||||
|   // CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 16, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) { | ||||
|   // CHECK:   [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK:   store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:   [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) { | ||||
|   // CHECK:     [[STRIDE_0:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index | ||||
|   // CHECK:     [[SPATIAL_H:%.+]] = addi [[MUL_0]], %arg5 : index | ||||
|   // CHECK:     [[UPPER_INDEX_0:%.+]] = constant 31 : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index | ||||
|   // CHECK:     [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index | ||||
| 
 | ||||
|   // CHECK:     [[STRIDE_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index | ||||
|   // CHECK:     [[SPATIAL_W:%.+]] = addi [[MUL_1]], %arg6 : index | ||||
|   // CHECK:     [[UPPER_INDEX_1:%.+]] = constant 31 : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index | ||||
|   // CHECK:     [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index | ||||
| 
 | ||||
|   // CHECK:     [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:     [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x16x16xf32> | ||||
|   // CHECK:   } | ||||
|   // CHECK: } | ||||
|   // CHECK: return [[RES]] : memref<1x3x16x16xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpooling_singleout_no_pad_w_strides_w_dilation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], dilations = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_dilation | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x14x14xf32> | ||||
|   // CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to 1, [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to 14, [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 14) { | ||||
|   // CHECK:   [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK:   store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32> | ||||
|   // CHECK:   [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) { | ||||
|   // CHECK:     [[STRIDE_0:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index | ||||
|   // CHECK:     [[STRIDE_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg5 : index | ||||
|   // CHECK:     [[SPATIAL_H:%.+]] = addi [[MUL_0]], [[MUL_1]] : index | ||||
|   // CHECK:     [[UPPER_INDEX_0:%.+]] = constant 31 : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index | ||||
|   // CHECK:     [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index | ||||
| 
 | ||||
|   // CHECK:     [[STRIDE_0_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_0_1:%.+]] = muli [[STRIDE_0_1]], %arg4 : index | ||||
|   // CHECK:     [[STRIDE_1_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_1_1:%.+]] = muli [[STRIDE_1_1]], %arg6 : index | ||||
|   // CHECK:     [[SPATIAL_W:%.+]] = addi [[MUL_0_1]], [[MUL_1_1]] : index | ||||
|   // CHECK:     [[UPPER_INDEX_1:%.+]] = constant 31 : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index | ||||
|   // CHECK:     [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index | ||||
| 
 | ||||
|   // CHECK:     [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32> | ||||
|   // CHECK:     [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x14x14xf32> | ||||
|   // CHECK:   } | ||||
|   // CHECK: } | ||||
|   // CHECK: return [[RES]] : memref<1x3x14x14xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode_w_unknown_dims(%arg0 : tensor<?x3x?x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [3, 3], strides = [2, 2], ceil_mode = 1} : (tensor<?x3x?x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: test_maxpooling_singleout_no_pad_w_strides_w_ceil_mode_w_unknown_dims | ||||
| 
 | ||||
|   // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x3x?x32xf32> | ||||
|   // CHECK: [[ZERO:%.+]] = constant 0 : i64 | ||||
|   // CHECK: [[ONE:%.+]] = constant 1 : i64 | ||||
|   // CHECK: [[DIM_1:%.+]] = dim %arg0, 2 : memref<?x3x?x32xf32> | ||||
|   // CHECK: [[DIM_1_i64:%.+]] = index_cast [[DIM_1]] : index to i64 | ||||
|   // CHECK: [[KERNEL_PAD_DILATION:%.+]] = constant -3 : i64 | ||||
|   // CHECK: [[NUMERATOR:%.+]] = addi [[DIM_1_i64]], [[KERNEL_PAD_DILATION]] : i64 | ||||
|   // CHECK: [[DENOMINATOR:%.+]] = constant 2 : i64 | ||||
|   // CHECK: [[DIV:%.+]] = divi_signed [[NUMERATOR]], [[DENOMINATOR]] : i64 | ||||
|   // CHECK: [[REMAINDER:%.+]] = remi_signed [[NUMERATOR]], [[DENOMINATOR]] : i64 | ||||
|   // CHECK: [[IS_ZERO:%.+]] = cmpi "eq", [[REMAINDER]], [[ZERO]] : i64 | ||||
|   // CHECK: [[DIV_PLUS_ONE:%.+]] = addi [[DIV]], [[ONE]] : i64 | ||||
|   // CHECK: [[SELECT:%.+]] = select [[IS_ZERO]], [[DIV]], [[DIV_PLUS_ONE]] : i64 | ||||
|   // CHECK: [[SELECT_PLUS_ONE:%.+]] = addi [[SELECT]], [[ONE]] : i64 | ||||
|   // CHECK: [[DIM_1_FINAL:%.+]] = index_cast [[SELECT_PLUS_ONE]] : i64 to index | ||||
|   // CHECK: [[RES:%.+]] = alloc([[DIM_0]], [[DIM_1_FINAL]]) : memref<?x3x?x16xf32> | ||||
| 
 | ||||
|   // CHECK: [[DEF_LOOPS_0:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_LOOPS_0:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[DEF_LOOPS_0]]#0, [[DEF_LOOPS_0]]#1, [[DEF_LOOPS_0]]#2, [[DEF_LOOPS_0]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x3x?x16xf32> | ||||
|   // CHECK: [[DIM_3:%.+]] = dim [[RES]], 2 : memref<?x3x?x16xf32> | ||||
|   // CHECK: krnl.iterate([[OPT_LOOPS_0]]#0, [[OPT_LOOPS_0]]#1, [[OPT_LOOPS_0]]#2, [[OPT_LOOPS_0]]#3) with ([[DEF_LOOPS_0]]#0 -> %arg1 = 0 to [[DIM_2]], [[DEF_LOOPS_0]]#1 -> %arg2 = 0 to 3, [[DEF_LOOPS_0]]#2 -> %arg3 = 0 to [[DIM_3]], [[DEF_LOOPS_0]]#3 -> %arg4 = 0 to 16) { | ||||
|   // CHECK:   [[NEGATIVE_INFINITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK:   store [[NEGATIVE_INFINITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32> | ||||
|   // CHECK:   [[DEF_LOOPS_1:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_LOOPS_1:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[DEF_LOOPS_1]]#0, [[DEF_LOOPS_1]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_LOOPS_1]]#0, [[OPT_LOOPS_1]]#1) with ([[DEF_LOOPS_1]]#0 -> %arg5 = 0 to 3, [[DEF_LOOPS_1]]#1 -> %arg6 = 0 to 3) { | ||||
|   // CHECK:     [[STRIDE_0:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_0:%.+]] = muli [[STRIDE_0]], %arg3 : index | ||||
|   // CHECK:     [[SPATIAL_H:%.+]] = addi [[MUL_0]], %arg5 : index | ||||
|   // CHECK:     [[DIM_0_0:%.+]] = dim %arg0, 2 : memref<?x3x?x32xf32> | ||||
|   // CHECK:     [[ONE_INDEX:%.+]] = constant 1 : index | ||||
|   // CHECK:     [[UPPER_INDEX_0:%.+]] = subi [[DIM_0_0]], [[ONE_INDEX]] : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_0:%.+]] = cmpi "sgt", [[SPATIAL_H]], [[UPPER_INDEX_0]] : index | ||||
|   // CHECK:     [[H:%.+]] = select [[GREATER_THAN_UPPER_0]], [[UPPER_INDEX_0]], [[SPATIAL_H]] : index | ||||
| 
 | ||||
|   // CHECK:     [[STRIDE_1:%.+]] = constant 2 : index | ||||
|   // CHECK:     [[MUL_1:%.+]] = muli [[STRIDE_1]], %arg4 : index | ||||
|   // CHECK:     [[SPATIAL_W:%.+]] = addi [[MUL_1]], %arg6 : index | ||||
|   // CHECK:     [[UPPER_INDEX_1:%.+]] = constant 31 : index | ||||
|   // CHECK:     [[GREATER_THAN_UPPER_1:%.+]] = cmpi "sgt", [[SPATIAL_W]], [[UPPER_INDEX_1]] : index | ||||
|   // CHECK:     [[W:%.+]] = select [[GREATER_THAN_UPPER_1]], [[UPPER_INDEX_1]], [[SPATIAL_W]] : index | ||||
| 
 | ||||
|   // CHECK:     [[LOAD_X:%.+]] = load %arg0[%arg1, %arg2, [[H]], [[W]]] : memref<?x3x?x32xf32> | ||||
|   // CHECK:     [[LOAD_Y:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32> | ||||
|   // CHECK:     [[CMP_2:%.+]] = cmpf "ogt", [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[CMP_2]], [[LOAD_Y]], [[LOAD_X]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<?x3x?x16xf32> | ||||
|   // CHECK:   } | ||||
|   // CHECK: } | ||||
|   // CHECK: return [[RES]] : memref<?x3x?x16xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_abs_float(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.Abs"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
|  | @ -1810,7 +1585,6 @@ func @test_constant_dense_2d_value(%arg0: tensor<1xf32>) -> tensor<*xf32> { | |||
|   // CHECK: return [[RES]] : memref<3x2xf32> | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>, %arg2 : tensor<5x5x5x32xf32>) -> tensor<5x5x9x32xf32> { | ||||
|  | @ -1849,3 +1623,133 @@ func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>, | |||
| 
 | ||||
|   // CHECK: return [[RES]] :  memref<5x5x9x32xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_pool_general_computation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> ((s2 ceildiv s4) * s4 - s2, d0 * s3 - s2)> | ||||
|   // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0, d0 * s3 + (s1 - 1) * s4 - s2 + 1)> | ||||
|   // CHECK-DAG: #{{.*}} = affine_map<() -> (0)> | ||||
|   // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0 - ((s2 ceildiv s4) * s4 - s2), -(d0 * s3 - s2) + s0, d0 * s3 + (s1 - 1) * s4 - s2 - ((s2 ceildiv s4) * s4 - s2) + 1, d0 * s3 + (s1 - 1) * s4 - s2 - (d0 * s3 - s2) + 1)> | ||||
| 
 | ||||
|   // CHECK-LABEL: @test_pool_general_computation | ||||
| 
 | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
|   // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 | ||||
| 
 | ||||
|   // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { | ||||
| 
 | ||||
|   // CHECK:   store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
| 
 | ||||
|   // CHECK:   [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { | ||||
|   // CHECK:     {{.*}} = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:     store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   } | ||||
| 
 | ||||
|   // CHECK:   {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK: } | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_averagepool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: @test_averagepool_identity_value | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
|   // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 | ||||
|   // CHECK: store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: @test_maxpool_identity_value | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
|   // CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32 | ||||
|   // CHECK: store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_averagepool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: @test_averagepool_pooling_operation | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
| 
 | ||||
|   // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { | ||||
| 
 | ||||
|   // CHECK:   [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { | ||||
| 
 | ||||
|   // CHECK:     [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[OUTPUT_LOAD:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:     [[SUM:%.+]] = addf [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 | ||||
|   // CHECK:     store [[SUM]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   } | ||||
| 
 | ||||
|   // CHECK:   [[NUMERATOR:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   [[AVERAGE:%.+]] = divf [[NUMERATOR]], {{.*}} : f32 | ||||
|   // CHECK:   store [[AVERAGE]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK: } | ||||
| } | ||||
| 
 | ||||
| // ----- | ||||
| 
 | ||||
| func @test_maxpool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { | ||||
|   %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> | ||||
|   "std.return"(%0) : (tensor<*xf32>) -> () | ||||
| 
 | ||||
|   // CHECK-LABEL: @test_maxpool_pooling_operation | ||||
|   // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> | ||||
| 
 | ||||
|   // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 | ||||
|   // CHECK: [[OPT_OUTPUT_LOOPS:%.+]]:4 = krnl.optimize_loops  { | ||||
|   // CHECK:   krnl.return_loops [[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3 | ||||
|   // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) | ||||
|   // CHECK: krnl.iterate([[OPT_OUTPUT_LOOPS]]#0, [[OPT_OUTPUT_LOOPS]]#1, [[OPT_OUTPUT_LOOPS]]#2, [[OPT_OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { | ||||
| 
 | ||||
|   // CHECK:   [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 | ||||
|   // CHECK:   [[OPT_POOL_LOOPS:%.+]]:2 = krnl.optimize_loops  { | ||||
|   // CHECK:     krnl.return_loops [[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1 | ||||
|   // CHECK:   } : () -> (!krnl.loop, !krnl.loop) | ||||
|   // CHECK:   krnl.iterate([[OPT_POOL_LOOPS]]#0, [[OPT_POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #map3(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #map3(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { | ||||
| 
 | ||||
|   // CHECK:     [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> | ||||
|   // CHECK:     [[OUTPUT_LOAD:%.+]] = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:     [[GREATER:%.+]] = cmpf "ogt", [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 | ||||
|   // CHECK:     [[SELECT:%.+]] = select [[GREATER]], [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 | ||||
|   // CHECK:     store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK:   } | ||||
| 
 | ||||
|   // CHECK-NOT:   {{.*}} = load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK-NOT:   store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> | ||||
|   // CHECK: } | ||||
| } | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue