Lower convolution to KRNL dialect. (#65)
* Ensure data shape is at least 4.
* First version of convolution.
* Simplify code for KRNL lowering.
* Add test without padding or strides.
* Refactor code for lowering frontend operations to KRNL dialect.
* Add test for conv with no bias and no padding.
* Add test with group greater than one.
* Address comment.
parent 0564c0eaef
commit 0272451521
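For reference, below is a minimal standalone C++ sketch of the computation the emitted Krnl loop nest performs (grouped convolution, no bias, no padding, unit strides), mirroring the loop-nest comment in the new ONNXConvNoBiasOpLowering pattern further down. The function name, raw-pointer interface, and flat row-major indexing are illustrative assumptions and are not part of this commit:

#include <cstdint>

// Illustrative reference semantics: grouped convolution, no bias,
// no padding, unit strides.
// D: (N x C x H x W), K: (M x C/group x KH x KW), R: (N x M x RH x RW),
// where RH = H - KH + 1 and RW = W - KW + 1.
void convNoBiasRef(const float *D, const float *K, float *R,
                   int64_t N, int64_t C, int64_t H, int64_t W,
                   int64_t M, int64_t KH, int64_t KW, int64_t group) {
  const int64_t RH = H - KH + 1, RW = W - KW + 1;
  const int64_t kernelsPerGroup = M / group;
  const int64_t subchannels = C / group; // input channels seen by one kernel
  for (int64_t n = 0; n < N; ++n)
    for (int64_t g = 0; g < group; ++g)
      for (int64_t m = 0; m < kernelsPerGroup; ++m) {
        const int64_t kernel = g * kernelsPerGroup + m;
        for (int64_t r1 = 0; r1 < RH; ++r1)
          for (int64_t r2 = 0; r2 < RW; ++r2) {
            float acc = 0.0f;
            for (int64_t c = 0; c < subchannels; ++c)
              for (int64_t k1 = 0; k1 < KH; ++k1)
                for (int64_t k2 = 0; k2 < KW; ++k2)
                  acc += D[((n * C + g * subchannels + c) * H + (r1 + k1)) * W +
                           (r2 + k2)] *
                         K[((kernel * subchannels + c) * KH + k1) * KW + k2];
            R[((n * M + kernel) * RH + r1) * RW + r2] = acc;
          }
      }
}

The lowering below emits the same nest with Krnl loops; the lit tests at the end of the diff check the generated IR for both the single-group and the grouped case.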
					
@@ -628,6 +628,10 @@ void ONNXConvNoBiasOp::inferShapes() {
   auto dataShape = dataTy.getShape();
   auto weightShape = weightTy.getShape();
 
+  // Lowest ranked input supported is of shape (N x C x H x W).
+  if (dataShape.size() < 4)
+    emitError("Data input shape must be at least (NxCxHxW).");
+
   // Check that shape of weight and data have same length.
   if (dataShape.size() != weightShape.size())
     emitError("Weight size not compatible with data size.");
@@ -130,6 +130,78 @@ static bool checkInsertDealloc(Operation *currentOp) {
   return insertDealloc;
 }
 
+// Add bounds associated with the op operand to the KRNL iteration pack.
+// Dynamic dimensions are supported.
+static void addDimensionToPack(ConversionPatternRewriter &rewriter,
+    Location loc, KrnlIterateOperandPack &pack, Value operand, int index) {
+  auto shape = operand.getType().cast<MemRefType>().getShape();
+  if (shape[index] < 0) {
+    pack.pushConstantBound(0);
+    pack.pushOperandBound(
+        rewriter.create<DimOp>(loc, operand, index).getResult());
+  } else {
+    pack.pushConstantBound(0);
+    pack.pushConstantBound(shape[index]);
+  }
+}
+
+// Function that defines the KRNL dialect loops and their respective
+// optimized version.
+static KrnlOptimizeLoopsOp emitOptimizedLoops(
+    ConversionPatternRewriter &rewriter, Location loc,
+    std::vector<Value> &loops, std::vector<Value> &optimizedLoops,
+    int64_t numLoops) {
+  // Define loops.
+  auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, numLoops);
+  loops.reserve(numLoops);
+  for (auto result : loopsOp.getResults())
+    loops.push_back(result);
+
+  // Define optimized version of the loops.
+  auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, numLoops);
+  optimizedLoops.reserve(numLoops);
+  for (auto result : optimizedLoopsOp.getResults())
+    optimizedLoops.push_back(result);
+
+  return optimizedLoopsOp;
+}
+
+// Function that emits the loops and their optimized version.
+// The function returns a reference to the inner optimization block.
+static Block* defineLoops(ConversionPatternRewriter &rewriter,
+    Location loc, std::vector<Value> &loops,
+    std::vector<Value> &optimizedLoops, int64_t numLoops) {
+  KrnlOptimizeLoopsOp optimizedLoopsOp = emitOptimizedLoops(
+      rewriter, loc, loops, optimizedLoops, numLoops);
+  return &optimizedLoopsOp.region().front();
+}
+
+// Function which emits a basic set of loops and optimized loops
+// for a given operation argument. A reference to the loop optimization
+// block is returned in the last argument of the function.
+static void emitKrnlLoopsAndIterationForOperand(
+    ConversionPatternRewriter &rewriter, Location loc,
+    Value operand, std::vector<Value> &originalLoops,
+    KrnlOptimizeLoopsOp &optimizedLoopsOp, KrnlIterateOp &iterateOp) {
+  // Operand shape.
+  auto shape = operand.getType().cast<MemRefType>().getShape();
+
+  // Number of loops.
+  int64_t rank = shape.size();
+
+  // Define loops and optimized loops.
+  std::vector<Value> optimizedLoops;
+  optimizedLoopsOp = emitOptimizedLoops(rewriter, loc, originalLoops,
+      optimizedLoops, rank);
+
+  KrnlIterateOperandPack pack(rewriter, originalLoops, optimizedLoops);
+  // Iterate over the loop nest.
+  for (int i = 0; i < rank; ++i)
+    addDimensionToPack(rewriter, loc, pack, operand, i);
+
+  iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+}
+
 unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
   auto elementType = memRefType.getElementType();
 
@@ -749,55 +821,21 @@ struct ONNXElementwiseUnaryOpLowering : public ConversionPattern {
       alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc,
                                     {operands[0]});
 
-    // Number of loops
-    auto memRefShape = memRefType.getShape();
-    int64_t rank = memRefShape.size();
-
-    // Define loops.
-    auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, rank);
     std::vector<Value> originalLoops;
-    originalLoops.reserve(rank);
-    for (auto result : loopsOp.getResults()) {
-      originalLoops.push_back(result);
-    }
-
-    // Define loop optimization.
-    auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, rank);
-    std::vector<Value> optimizedLoops;
-    optimizedLoops.reserve(rank);
-    for (auto result : optimizedLoopsOp.getResults()) {
-      optimizedLoops.push_back(result);
-    }
+    KrnlOptimizeLoopsOp optimizedLoopsOp;
+    KrnlIterateOp iterateOp;
+    emitKrnlLoopsAndIterationForOperand(
+        rewriter, loc, operands[0], originalLoops,
+        optimizedLoopsOp, iterateOp);
     Block &optimizationBlock = optimizedLoopsOp.region().front();
-
-    KrnlIterateOperandPack pack(rewriter, originalLoops, optimizedLoops);
-    // Iterate over the loop nest.
-    // TODO (Tian): move this logic inside KrnlIterateOp. Pass MemRefShape
-    // to KrnlIterateOp instead.
-    for (int i = 0; i < rank; ++i) {
-      if (memRefShape[i] < 0) {
-        pack.pushConstantBound(0);
-        pack.pushOperandBound(
-            rewriter.create<DimOp>(loc, operands[0], i).getResult());
-      } else {
-        pack.pushConstantBound(0);
-        pack.pushConstantBound(memRefShape[i]);
-      }
-    }
-
-    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
     Block &iterationBlock = iterateOp.bodyRegion().front();
 
-    // Now perform the insertions into the body of the
-    // just generated instructions:
-
     // 1. Insert any optimizations in the KrnlOptimizeLoopsOp body.
     rewriter.setInsertionPointToEnd(&optimizationBlock);
     // Return from KrnlOptimizeLoopsOp body.
     // When no optimizations are present we just return the loops
     // unchanged.
     rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-    rewriter.setInsertionPoint(optimizedLoopsOp);
 
     // 2. Insert instructions inside the KernelIterateOp body.
     rewriter.setInsertionPointToStart(&iterationBlock);
@@ -851,59 +889,25 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
       alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc,
                                     operands);
 
-    // Number of loops
-    auto memRefShape = memRefType.getShape();
-    int64_t rank = memRefShape.size();
-
-    // Define loops.
-    auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, rank);
-    std::vector<Value> originalLoops;
-    originalLoops.reserve(rank);
-    for (auto result : loopsOp.getResults()) {
-      originalLoops.push_back(result);
-    }
-
-    // Define loop optimization.
-    auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, rank);
-    std::vector<Value> optimizedLoops;
-    optimizedLoops.reserve(rank);
-    for (auto result : optimizedLoopsOp.getResults()) {
-      optimizedLoops.push_back(result);
-    }
-    Block &optimizationBlock = optimizedLoopsOp.region().front();
-
-    KrnlIterateOperandPack pack(rewriter, originalLoops, optimizedLoops);
-    // Iterate over the loop nest.
-    // TODO (Tian): move this logic inside KrnlIterateOp. Pass MemRefShape
-    // to KrnlIterateOp instead.
-    for (int i = 0; i < rank; ++i) {
-      if (memRefShape[i] < 0) {
-        pack.pushConstantBound(0);
-        pack.pushOperandBound(
-            rewriter.create<DimOp>(loc, alloc, i).getResult());
-      } else {
-        pack.pushConstantBound(0);
-        pack.pushConstantBound(memRefShape[i]);
-      }
-    }
-
     // Get run-time dimension information for unknown dimensions used for
     // broadcasting.
     std::map<int, std::map<int, Value>> broadcastedDimInfo =
         getBroadcastedDimInfo(loc, rewriter, memRefType, operands);
 
-    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    std::vector<Value> originalLoops;
+    KrnlOptimizeLoopsOp optimizedLoopsOp;
+    KrnlIterateOp iterateOp;
+    emitKrnlLoopsAndIterationForOperand(
+        rewriter, loc, alloc, originalLoops,
+        optimizedLoopsOp, iterateOp);
+    Block &optimizationBlock = optimizedLoopsOp.region().front();
     Block &iterationBlock = iterateOp.bodyRegion().front();
 
-    // Now perform the insertions into the body of the
-    // just generated instructions:
-
     // 1. Insert any optimizations in the KrnlOptimizeLoopsOp body.
     rewriter.setInsertionPointToEnd(&optimizationBlock);
     // Return from KrnlOptimizeLoopsOp body.
    // When no optimizations are present we just return the loops unchanged.
     rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-    rewriter.setInsertionPoint(optimizedLoopsOp);
 
     // 2. Insert instructions inside the KernelIterateOp body.
     rewriter.setInsertionPointToStart(&iterationBlock);
@@ -978,21 +982,10 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
         FloatAttr::get(elementType, -std::numeric_limits<float>::infinity()));
 
     // Define loops.
-    auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, rank);
     std::vector<Value> originalLoops;
-    originalLoops.reserve(rank);
-    for (auto result : loopsOp.getResults()) {
-      originalLoops.push_back(result);
-    }
-
-    // Define loop optimization.
-    auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, rank);
     std::vector<Value> optimizedLoops;
-    optimizedLoops.reserve(rank);
-    for (auto result : optimizedLoopsOp.getResults()) {
-      optimizedLoops.push_back(result);
-    }
-    Block &optimizationBlock = optimizedLoopsOp.region().front();
+    Block *optimizationBlock = defineLoops(rewriter, loc, originalLoops,
+            optimizedLoops, rank);
 
     // Coerce the input into a 2-D tensor. `axis` will be the coercing point.
     // This coercing follows the softmax definition in ONNX:
@@ -1009,16 +1002,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
       optimizedOuterLoops.push_back(optimizedLoops[i]);
     }
     KrnlIterateOperandPack outerPack(rewriter, outerLoops, optimizedOuterLoops);
-    for (int i = 0; i < axis; ++i) {
-      if (memRefShape[i] < 0) {
-        outerPack.pushConstantBound(0);
-        outerPack.pushOperandBound(
-            rewriter.create<DimOp>(loc, operands[0], i).getResult());
-      } else {
-        outerPack.pushConstantBound(0);
-        outerPack.pushConstantBound(memRefShape[i]);
-      }
-    }
+    for (int i = 0; i < axis; ++i)
+      addDimensionToPack(rewriter, loc, outerPack, operands[0], i);
+
     // Define an inner loop with respect to axis.
     std::vector<Value> innerLoops, optimizedInnerLoops;
     innerLoops.reserve(rank - axis);
@@ -1028,16 +1014,8 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
       optimizedInnerLoops.push_back(optimizedLoops[i]);
     }
     KrnlIterateOperandPack innerPack(rewriter, innerLoops, optimizedInnerLoops);
-    for (int i = axis; i < rank; ++i) {
-      if (memRefShape[i] < 0) {
-        innerPack.pushConstantBound(0);
-        innerPack.pushOperandBound(
-            rewriter.create<DimOp>(loc, operands[0], i).getResult());
-      } else {
-        innerPack.pushConstantBound(0);
-        innerPack.pushConstantBound(memRefShape[i]);
-      }
-    }
+    for (int i = axis; i < rank; ++i)
+      addDimensionToPack(rewriter, loc, innerPack, operands[0], i);
 
     KrnlIterateOp outerIterateOp, maxIterateOp, sumIterateOp, softmaxIterateOp;
     SmallVector<Value, 4> outerLoopIVs;
@@ -1045,9 +1023,8 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
       outerIterateOp = rewriter.create<KrnlIterateOp>(loc, outerPack);
 
       // No optimization
-      rewriter.setInsertionPointToEnd(&optimizationBlock);
+      rewriter.setInsertionPointToEnd(optimizationBlock);
       rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-      rewriter.setInsertionPoint(optimizedLoopsOp);
 
       // Insert instructions inside the outer loop.
       Block &outerIterationBlock = outerIterateOp.bodyRegion().front();
@@ -1078,9 +1055,8 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
       softmaxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
 
      // No optimization
-      rewriter.setInsertionPointToEnd(&optimizationBlock);
+      rewriter.setInsertionPointToEnd(optimizationBlock);
      rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-      rewriter.setInsertionPoint(optimizedLoopsOp);
     }
 
     // Insert instructions inside the max loop.
@@ -1291,20 +1267,10 @@ struct ONNXGemmOpLowering : public ConversionPattern {
     int64_t numLoops = 3;
 
     // Define loops.
-    auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, numLoops);
     std::vector<Value> originalLoops;
-    originalLoops.reserve(numLoops);
-    for (auto result : loopsOp.getResults()) {
-      originalLoops.push_back(result);
-    }
-
-    auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, numLoops);
     std::vector<Value> optimizedLoops;
-    optimizedLoops.reserve(numLoops);
-    for (auto result : optimizedLoopsOp.getResults()) {
-      optimizedLoops.push_back(result);
-    }
-    Block &optimizationBlock = optimizedLoopsOp.region().front();
+    Block *optimizationBlock = defineLoops(rewriter, loc, originalLoops,
+            optimizedLoops, numLoops);
 
     // We have two Krnl loops:
     // - Outer loop iterates over the output matrix dimensions, and
@@ -1321,16 +1287,9 @@ struct ONNXGemmOpLowering : public ConversionPattern {
     KrnlIterateOperandPack outerPack(rewriter, outerLoops,
                                       optimizedOuterLoops);
     // Induction variables for the outer loops
-    for (int i = 0; i < 2; ++i) {
-      if (memRefShape[i] < 0) {
-        outerPack.pushConstantBound(0);
-        outerPack.pushOperandBound(
-            rewriter.create<DimOp>(loc, alloc, i).getResult());
-      } else {
-        outerPack.pushConstantBound(0);
-        outerPack.pushConstantBound(memRefShape[i]);
-      }
-    }
+    for (int i = 0; i < 2; ++i)
+      addDimensionToPack(rewriter, loc, outerPack, alloc, i);
+
     // Reduction loop
     std::vector<Value> reductionLoops, optimizedReductionLoops;
     reductionLoops.reserve(1);
@@ -1378,9 +1337,8 @@ struct ONNXGemmOpLowering : public ConversionPattern {
     // just generated instructions:
 
     // No optimization
-    rewriter.setInsertionPointToEnd(&optimizationBlock);
+    rewriter.setInsertionPointToEnd(optimizationBlock);
     rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-    rewriter.setInsertionPoint(optimizedLoopsOp);
 
     // Insert instructions inside the outer loop.
     Block &outerIterationBlock = outerIterateOp.bodyRegion().front();
@@ -1544,36 +1502,15 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
     int64_t rank = memRefShape.size();
 
     // Define loops.
-    auto loopsOp = rewriter.create<KrnlDefineLoopsOp>(loc, rank);
     std::vector<Value> originalLoops;
-    originalLoops.reserve(rank);
-
-    for (auto result : loopsOp.getResults()) {
-      originalLoops.push_back(result);
-    }
-
-    // Define loop optimization.
-    auto optimizedLoopsOp = rewriter.create<KrnlOptimizeLoopsOp>(loc, rank);
     std::vector<Value> optimizedLoops;
-    optimizedLoops.reserve(rank);
-
-    for (auto result : optimizedLoopsOp.getResults()) {
-      optimizedLoops.push_back(result);
-    }
-    Block &optimizationBlock = optimizedLoopsOp.region().front();
+    Block *optimizationBlock = defineLoops(rewriter, loc, originalLoops,
+        optimizedLoops, rank);
+
     KrnlIterateOperandPack pack(rewriter, originalLoops, optimizedLoops);
     // Iterate over the loop nest using the input shape.
-    auto inputShape = operands[0].getType().cast<MemRefType>().getShape();
-    for (int i = 0; i < rank; ++i) {
-      if (inputShape[i] < 0) {
-        pack.pushConstantBound(0);
-        pack.pushOperandBound(
-            rewriter.create<DimOp>(loc, operands[0], i).getResult());
-      } else {
-        pack.pushConstantBound(0);
-        pack.pushConstantBound(inputShape[i]);
-      }
-    }
+    for (int i = 0; i < rank; ++i)
+      addDimensionToPack(rewriter, loc, pack, operands[0], i);
 
     auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
     Block &iterationBlock = iterateOp.bodyRegion().front();
@@ -1582,12 +1519,11 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
     // just generated instructions:
 
     // 1. Insert any optimizations in the KrnlOptimizeLoopsOp body.
-    rewriter.setInsertionPointToEnd(&optimizationBlock);
+    rewriter.setInsertionPointToEnd(optimizationBlock);
     // Return from KrnlOptimizeLoopsOp body.
     // When no optimizations are present we just return the loops
     // unchanged.
     rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
-    rewriter.setInsertionPoint(optimizedLoopsOp);
 
     // 2. Insert instructions inside the KernelIterateOp body.
     rewriter.setInsertionPointToStart(&iterationBlock);
@@ -1638,6 +1574,255 @@ struct ONNXIdentityOpLowering : public ConversionPattern {
   }
 };
 
+struct ONNXConvNoBiasOpLowering : public ConversionPattern {
+  ONNXConvNoBiasOpLowering(MLIRContext *ctx)
+      : ConversionPattern(mlir::ONNXConvNoBiasOp::getOperationName(), 1, ctx) {}
+
+  PatternMatchResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    auto tensorType = (*op->result_type_begin()).cast<TensorType>();
+    auto loc = op->getLoc();
+    // Insert an allocation and deallocation for the result of this operation.
+    auto memRefType = convertTensorToMemRef(tensorType);
+    Value alloc;
+    bool insertDealloc = checkInsertDealloc(op);
+    ONNXConvNoBiasOp convOp = llvm::dyn_cast<ONNXConvNoBiasOp>(op);
+
+    if (hasAllConstantDimensions(memRefType))
+      alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc);
+    else
+      alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc,
+                                    {operands[0]});
+
+    auto resultShape = memRefType.getShape();
+    auto inputShape = operands[0].getType().cast<MemRefType>().getShape();
+    auto kernelShape = operands[1].getType().cast<MemRefType>().getShape();
+
+    // R = ConvNoBias(D, K)
+    //
+    // The input/output shapes will look like this:
+    //
+    // D (NxCxHxW) x K (MxC/groupxKHxKW) -> R (NxMxRHxRW)
+    //
+    // M is a multiple of the number of groups:
+    //   M = group * kernelsPerGroup
+    //
+    // The loop nest will look as follows:
+    //
+    // kernelsPerGroup = M / group;
+    // for n = 0 .. N:
+    //   for g = 0 .. group:
+    //     for m = 0 .. kernelsPerGroup:
+    //       kernel = g * kernelsPerGroup + m;
+    //       for r1 = 0 .. RH:
+    //         for r2 = 0 .. RW:
+    //           R[n][kernel][r1][r2] = 0;
+    //           for c = 0 .. C/group:
+    //             for k1 = 0 .. KH:
+    //               for k2 = 0 .. KW:
+    //                 R[n][kernel][r1][r2] +=
+    //                   D[n][g * (C / group) + c][r1 + k1][r2 + k2] *
+    //                   K[kernel][c][k1][k2];
+    //
+    // TODO: handle padding.
+    //
+    // In the general case:
+    //
+    // D (NxCxD1xD2x...xDdim) x K (MxC/groupxK1xK2x...xKdim)
+    //     -> R (NxMxR1xR2x...xRdim)
+    //
+    // The above loop nest can be adapted by increasing the number
+    // of r- and k-index loops, i.e. the r1 r2 and k1 k2 loops.
+
+    // Set up outermost loops: n g m r1 r2 ... rdim
+    // Skip g if group is 1.
+
+    // Before we start the iteration we need to compute the number of
+    // unsplit kernels and fetch the number of groups from the attribute
+    // list. Group is always a compilation constant.
+    int64_t group = convOp.group().getSExtValue();
+    // Compute the number of unsplit kernels. The number of kernels
+    // must be a multiple of the number of groups.
+    int64_t kernelsPerGroup = floor(kernelShape[0] / group);
+    auto kernelsPerGroupValue =
+        rewriter.create<ConstantIndexOp>(loc, kernelsPerGroup);
+    auto zero = rewriter.create<ConstantOp>(
+        loc, FloatAttr::get(memRefType.getElementType(), 0));
+    Value subchannels;
+    if (kernelShape[1] < 0) {
+      subchannels =
+          rewriter.create<DimOp>(loc, operands[1], 1).getResult();
+    } else {
+      subchannels = rewriter.create<ConstantIndexOp>(
+          loc, kernelShape[1]);
+    }
+
+    // 1. Define outer loops and emit empty optimization block:
+    int64_t nOuterLoops = (group > 1) ? 3 : 2;
+    std::vector<Value> outerLoops;
+    std::vector<Value> optimizedOuterLoops;
+    Block *optimizationBlock = defineLoops(rewriter, loc, outerLoops,
+        optimizedOuterLoops, nOuterLoops);
+
+    // Prepare iteration arguments over outer loop nest.
+    KrnlIterateOperandPack pack(
+        rewriter, outerLoops, optimizedOuterLoops);
+    //   for n = 0 .. N:
+    pack.pushConstantBound(0);
+    if (inputShape[0] < 0)
+      pack.pushOperandBound(
+          rewriter.create<DimOp>(loc, operands[0], 0).getResult());
+    else
+      pack.pushConstantBound(inputShape[0]);
+    //   for g = 0 .. group:
+    if (group > 1) {
+      pack.pushConstantBound(0);
+      pack.pushConstantBound(group);
+    }
+    //   for m = 0 .. kernelsPerGroup:
+    pack.pushConstantBound(0);
+    pack.pushConstantBound(kernelsPerGroup);
+    // Outer loop iteration.
+    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
+    Block &outerIterationBlock = iterateOp.bodyRegion().front();
+    // Emit optimizations for outer loops:
+    rewriter.setInsertionPointToEnd(optimizationBlock);
+    rewriter.create<KrnlReturnLoopsOp>(loc, outerLoops);
+    rewriter.setInsertionPointToStart(&outerIterationBlock);
+    {
+      // 2. Emit the body of the outer loop nest.
+
+      // 2.1 Compute kernel order number: kernel = g * kernelsPerGroup + m;
+      // If group is 1 then the value of the kernel ID is
+      // identical to that of the loop over kernels.
+      Value kernel = outerIterationBlock.getArguments()[1];
+      if (group > 1) {
+        // Middle loop is over groups and third loop is over the
+        // kernel identifiers in the current group.
+        auto kernelsOffset = rewriter.create<MulIOp>(loc,
+            outerIterationBlock.getArguments()[1],
+            kernelsPerGroupValue);
+        kernel = rewriter.create<AddIOp>(loc, kernelsOffset,
+            outerIterationBlock.getArguments()[2]);
+      }
+
+      // 2.2 Define spatial loops
+      int64_t nSpatialLoops = resultShape.size() - 2;
+      std::vector<Value> spatialLoops;
+      std::vector<Value> optimizedSpatialLoops;
+      Block *optSpatialLoopBlock = defineLoops(rewriter, loc, spatialLoops,
+        optimizedSpatialLoops, nSpatialLoops);
+
+      // 2.3 Prepare iteration arguments for spatial loop nest.
+      KrnlIterateOperandPack spatialPack(
+        rewriter, spatialLoops, optimizedSpatialLoops);
+      for (int i = 2; i < resultShape.size(); ++i)
+        addDimensionToPack(rewriter, loc, spatialPack, alloc, i);
+
+      // 2.4 Emit loop nest over output spatial dimensions.
+      //   for rX = 0 .. RX
+      auto spatialIterateOp =
+          rewriter.create<KrnlIterateOp>(loc, spatialPack);
+      Block &spatialIterationBlock = spatialIterateOp.bodyRegion().front();
+      // 2.5 Emit optimizations for spatial loops:
+      rewriter.setInsertionPointToEnd(optSpatialLoopBlock);
+      rewriter.create<KrnlReturnLoopsOp>(loc, spatialLoops);
+      rewriter.setInsertionPointToStart(&spatialIterationBlock);
+      {
+        // 3. Emit the body of the spatial loop nest.
+        // 3.1 Emit: R[n][kernel][r1][r2] = 0;
+        SmallVector<Value, 4> resultIndices;
+        // n
+        resultIndices.emplace_back(outerIterationBlock.getArguments()[0]);
+        // kernel
+        resultIndices.emplace_back(kernel);
+        // rX
+        for (auto arg : spatialIterationBlock.getArguments())
+          resultIndices.emplace_back(arg);
+        // Store initializer value into output location.
+        rewriter.create<StoreOp>(loc, zero, alloc, resultIndices);
+
+        // 3.2 Define inner loops.
+        int64_t nInnerLoops = 1 + (kernelShape.size() - 2);
+        std::vector<Value> innerLoops;
+        std::vector<Value> optimizedInnerLoops;
+        Block *optInnerLoopBlock = defineLoops(rewriter, loc, innerLoops,
+            optimizedInnerLoops, nInnerLoops);
+
+        // 3.3 Prepare iteration arguments for inner loop nest.
+        KrnlIterateOperandPack innerPack(
+            rewriter, innerLoops, optimizedInnerLoops);
+        //   for c = 0 .. C/group
+        innerPack.pushConstantBound(0);
+        innerPack.pushConstantBound(kernelShape[1]);
+        //   for Kx = 0 .. KX
+        for (int i = 2; i < kernelShape.size(); ++i)
+          addDimensionToPack(rewriter, loc, innerPack, operands[1], i);
+
+        // 3.4 Emit inner loop nest.
+        auto innerIterateOp =
+            rewriter.create<KrnlIterateOp>(loc, innerPack);
+        Block &innerIterationBlock = innerIterateOp.bodyRegion().front();
+        // 3.5 Emit optimizations for inner loops:
+        rewriter.setInsertionPointToEnd(optInnerLoopBlock);
+        rewriter.create<KrnlReturnLoopsOp>(loc, innerLoops);
+        rewriter.setInsertionPointToStart(&innerIterationBlock);
+        {
+          // 4. Emit inner loop body
+          // R[n][kernel][r1][r2] +=
+          //   D[n][g * (C / group) + c][r1 + k1][r2 + k2] *
+          //   K[kernel][c][k1][k2];
+
+          // 4.1 Prepare indices for accessing the data tensor.
+          SmallVector<Value, 4> dataIndices;
+          // n
+          dataIndices.emplace_back(outerIterationBlock.getArguments()[0]);
+          // g * (C / group) + c
+          Value channelDepth = innerIterationBlock.getArguments()[0];
+          if (group > 1)
+            channelDepth = rewriter.create<AddIOp>(loc, channelDepth,
+                rewriter.create<MulIOp>(loc, subchannels,
+                    outerIterationBlock.getArguments()[1]));
+          dataIndices.emplace_back(channelDepth);
+          // rX + kX
+          for (int i = 0; i < kernelShape.size() - 2; ++i)
+            dataIndices.emplace_back(
+                rewriter.create<AddIOp>(loc,
+                    spatialIterationBlock.getArguments()[i],
+                    innerIterationBlock.getArguments()[i+1]));
+
+          // 4.2 Prepare indices for accessing the kernel tensor.
+          SmallVector<Value, 4> kernelIndices;
+          // kernel
+          kernelIndices.emplace_back(kernel);
+          // c
+          kernelIndices.emplace_back(innerIterationBlock.getArguments()[0]);
+          // kX
+          for (int i = 0; i < kernelShape.size() - 2; ++i)
+            kernelIndices.emplace_back(
+                innerIterationBlock.getArguments()[i+1]);
+
+          // 4.3 Compute convolution.
+          auto loadData =
+              rewriter.create<LoadOp>(loc, operands[0], dataIndices);
+          auto loadKernel =
+              rewriter.create<LoadOp>(loc, operands[1], kernelIndices);
+          auto loadPartialSum =
+              rewriter.create<LoadOp>(loc, alloc, resultIndices);
+          Value result = rewriter.create<AddFOp>(loc, loadPartialSum,
+              rewriter.create<MulFOp>(loc, loadData, loadKernel));
+          // 4.4 Store computed value into output location.
+          rewriter.create<StoreOp>(loc, result, alloc, resultIndices);
+        }
+      }
+    }
+    rewriter.replaceOp(op, alloc);
+
+    return matchSuccess();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // EntryPoint Op lowering to Krnl Entry Point.
 //===----------------------------------------------------------------------===//
@@ -1769,7 +1954,8 @@ void FrontendToKrnlLoweringPass::runOnModule() {
                   ONNXReshapeOpLowering, ONNXEntryPointLowering,
                   ONNXSoftmaxOpLowering, ONNXGemmOpLowering,
                   ONNXUnsqueezeOpLowering, ONNXTransposeOpLowering,
-                  ONNXIdentityOpLowering>(&getContext());
+                  ONNXIdentityOpLowering, ONNXConvNoBiasOpLowering
+                  >(&getContext());
 
   // With the target and rewrite patterns defined, we can now attempt the
   // conversion. The conversion will signal failure if any of our `illegal`
@@ -202,6 +202,9 @@ test_to_enable = [
     "test_transpose_all_permutations_4_cpu",
     "test_transpose_all_permutations_5_cpu",
 
+    # Conv
+    "test_basic_conv_without_padding_cpu",
+
     # Sign Op:
     "test_sign_cpu",
 ]
@@ -568,15 +568,15 @@ func @test_add_with_broadcasting(%arg0 : tensor<?xf32>, %arg1 : tensor<?x10xf32>
   // CHECK-LABEL: test_add_with_broadcasting
   // CHECK: [[DIM1:%.+]] = dim %arg1, 0 : memref<?x10xf32>
   // CHECK: [[RES:%.+]] = alloc([[DIM1]]) : memref<?x10xf32>
+  // CHECK: [[DIM2:%.+]] = dim %arg0, 0 : memref<?xf32>
+  // CHECK: [[ONE:%.+]] = constant 1 : index
+  // CHECK: [[IS_ONE:%.+]] = cmpi "eq", [[DIM2]], [[ONE]] : index
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
   // CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
   // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
-  // CHECK: [[DIM3:%.+]] = dim %arg0, 0 : memref<?xf32>
-  // CHECK: [[ONE:%.+]] = constant 1 : index
-  // CHECK: [[IS_ONE:%.+]] = cmpi "eq", [[DIM3]], [[ONE]] : index
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[DIM3:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM3]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
   // CHECK: [[ZERO:%.+]] = constant 0 : index
   // CHECK: %[[SELECT1:.+]] = select [[IS_ONE]], [[ZERO]], %arg3 : index
   // CHECK: [[LOAD1:%.+]] = load %arg0[%[[SELECT1]]] : memref<?xf32>
@@ -788,3 +788,93 @@ func @test_sign_i(%arg0 : tensor<?x10xi32>) -> tensor<*xi32> {
   // CHECK: store [[SIGN_RES]], [[RES]][%arg1, %arg2] : memref<?x10xi32>
   // CHECK: return [[RES]] : memref<?x10xi32>
 }
+
+func @test_conv_no_bias_no_pad(%arg0 : tensor<1x2x32x64xf32>, %arg1 : tensor<5x2x6x7xf32>) -> tensor<*xf32> {
+  %0 = "onnx.ConvNoBias"(%arg0, %arg1) {auto_pad = "NOTSET", group = 1 : i64} : (tensor<1x2x32x64xf32>, tensor<5x2x6x7xf32>) -> tensor<*xf32>
+  "std.return"(%0) : (tensor<*xf32>) -> ()
+
+  // CHECK-LABEL: test_conv_no_bias_no_pad
+  // CHECK: [[RES:%.+]] = alloc() : memref<1x5x27x58xf32>
+  // CHECK: [[CONST0:%.+]] = constant 5 : index
+  // CHECK: [[CONST1:%.+]] = constant 0.000000e+00 : f32
+  // CHECK: [[CONST2:%.+]] = constant 2 : index
+  // CHECK: [[OUTER_LOOPS:%.+]]:2 = krnl.define_loops 2
+  // CHECK: [[OPT_OUTER_LOOPS:%.+]]:2 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[OUTER_LOOPS]]#0, [[OUTER_LOOPS]]#1
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_OUTER_LOOPS]]#0, [[OPT_OUTER_LOOPS]]#1) with ([[OUTER_LOOPS]]#0 -> %arg2 = 0 to 1, [[OUTER_LOOPS]]#1 -> %arg3 = 0 to 5) {
+  // CHECK: [[SPATIAL_LOOPS:%.+]]:2 = krnl.define_loops 2
+  // CHECK: [[OPT_SPATIAL_LOOPS:%.+]]:2 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[SPATIAL_LOOPS]]#0, [[SPATIAL_LOOPS]]#1
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_SPATIAL_LOOPS]]#0, [[OPT_SPATIAL_LOOPS]]#1) with ([[SPATIAL_LOOPS]]#0 -> %arg4 = 0 to 27, [[SPATIAL_LOOPS]]#1 -> %arg5 = 0 to 58) {
+  // CHECK: store [[CONST1]], [[RES]][%arg2, %arg3, %arg4, %arg5] : memref<1x5x27x58xf32>
+  // CHECK: [[INNER_LOOPS:%.+]]:3 = krnl.define_loops 3
+  // CHECK: [[OPT_INNER_LOOPS:%.+]]:3 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[INNER_LOOPS]]#0, [[INNER_LOOPS]]#1, [[INNER_LOOPS]]#2
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_INNER_LOOPS]]#0, [[OPT_INNER_LOOPS]]#1, [[OPT_INNER_LOOPS]]#2) with ([[INNER_LOOPS]]#0 -> %arg6 = 0 to 2, [[INNER_LOOPS]]#1 -> %arg7 = 0 to 6, [[INNER_LOOPS]]#2 -> %arg8 = 0 to 7) {
+  // CHECK: [[R1PLUSK1:%.+]] = addi %arg4, %arg7 : index
+  // CHECK: [[R2PLUSK2:%.+]] = addi %arg5, %arg8 : index
+  // CHECK: [[DATA:%.+]] = load %arg0[%arg2, %arg6, [[R1PLUSK1]], [[R2PLUSK2]]] : memref<1x2x32x64xf32>
+  // CHECK: [[KERNEL:%.+]] = load %arg1[%arg3, %arg6, %arg7, %arg8] : memref<5x2x6x7xf32>
+  // CHECK: [[ACC_RES:%.+]] = load %0[%arg2, %arg3, %arg4, %arg5] : memref<1x5x27x58xf32>
+  // CHECK: [[MUL:%.+]] = mulf [[DATA]], [[KERNEL]] : f32
+  // CHECK: [[ADD:%.+]] = addf [[ACC_RES]], [[MUL]] : f32
+  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3, %arg4, %arg5] : memref<1x5x27x58xf32>
+  // CHECK: }
+  // CHECK: }
+  // CHECK: }
+
+  // CHECK: return [[RES]] : memref<1x5x27x58xf32>
+}
+
+func @test_conv_no_bias_no_pad_w_group(%arg0 : tensor<1x9x32x64xf32>, %arg1 : tensor<5x3x6x7xf32>) -> tensor<*xf32> {
+  %0 = "onnx.ConvNoBias"(%arg0, %arg1) {auto_pad = "NOTSET", group = 3 : i64} : (tensor<1x9x32x64xf32>, tensor<5x3x6x7xf32>) -> tensor<*xf32>
+  "std.return"(%0) : (tensor<*xf32>) -> ()
+
+  // CHECK-LABEL: test_conv_no_bias_no_pad_w_group
+  // CHECK: [[RES:%.+]] = alloc() : memref<1x5x27x58xf32>
+  // CHECK: [[CONST0:%.+]] = constant 1 : index
+  // CHECK: [[CONST1:%.+]] = constant 0.000000e+00 : f32
+  // CHECK: [[CONST2:%.+]] = constant 3 : index
+  // CHECK: [[OUTER_LOOPS:%.+]]:3 = krnl.define_loops 3
+  // CHECK: [[OPT_OUTER_LOOPS:%.+]]:3 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[OUTER_LOOPS]]#0, [[OUTER_LOOPS]]#1, [[OUTER_LOOPS]]#2
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_OUTER_LOOPS]]#0, [[OPT_OUTER_LOOPS]]#1, [[OPT_OUTER_LOOPS]]#2) with ([[OUTER_LOOPS]]#0 -> %arg2 = 0 to 1, [[OUTER_LOOPS]]#1 -> %arg3 = 0 to 3, [[OUTER_LOOPS]]#2 -> %arg4 = 0 to 1) {
+  // CHECK: [[MUL1:%.+]] = muli %arg3, [[CONST0]] : index
+  // CHECK: %[[ADD1:.+]] = addi [[MUL1]], %arg4 : index
+  // CHECK: [[SPATIAL_LOOPS:%.+]]:2 = krnl.define_loops 2
+  // CHECK: [[OPT_SPATIAL_LOOPS:%.+]]:2 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[SPATIAL_LOOPS]]#0, [[SPATIAL_LOOPS]]#1
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_SPATIAL_LOOPS]]#0, [[OPT_SPATIAL_LOOPS]]#1) with ([[SPATIAL_LOOPS]]#0 -> %arg5 = 0 to 27, [[SPATIAL_LOOPS]]#1 -> %arg6 = 0 to 58) {
+  // CHECK: store [[CONST1]], [[RES]][%arg2, %[[ADD1]], %arg5, %arg6] : memref<1x5x27x58xf32>
+  // CHECK: [[INNER_LOOPS:%.+]]:3 = krnl.define_loops 3
+  // CHECK: [[OPT_INNER_LOOPS:%.+]]:3 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[INNER_LOOPS]]#0, [[INNER_LOOPS]]#1, [[INNER_LOOPS]]#2
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop, !krnl.loop)
+
+  // CHECK: krnl.iterate([[OPT_INNER_LOOPS]]#0, [[OPT_INNER_LOOPS]]#1, [[OPT_INNER_LOOPS]]#2) with ([[INNER_LOOPS]]#0 -> %arg7 = 0 to 3, [[INNER_LOOPS]]#1 -> %arg8 = 0 to 6, [[INNER_LOOPS]]#2 -> %arg9 = 0 to 7) {
+  // CHECK: [[MUL2:%.+]] = muli [[CONST2]], %arg3 : index
+  // CHECK: [[ADD2:%.+]] = addi %arg7, [[MUL2]] : index
+  // CHECK: [[R1PLUSK1:%.+]] = addi %arg5, %arg8 : index
+  // CHECK: [[R2PLUSK2:%.+]] = addi %arg6, %arg9 : index
+  // CHECK: [[DATA:%.+]] = load %arg0[%arg2, [[ADD2]], [[R1PLUSK1]], [[R2PLUSK2]]] : memref<1x9x32x64xf32>
+  // CHECK: [[KERNEL:%.+]] = load %arg1[%[[ADD1]], %arg7, %arg8, %arg9] : memref<5x3x6x7xf32>
+  // CHECK: [[ACC_RES:%.+]] = load %0[%arg2, %[[ADD1]], %arg5, %arg6] : memref<1x5x27x58xf32>
+  // CHECK: [[MUL:%.+]] = mulf [[DATA]], [[KERNEL]] : f32
+  // CHECK: [[ADD:%.+]] = addf [[ACC_RES]], [[MUL]] : f32
+  // CHECK: store [[ADD]], [[RES]][%arg2, %[[ADD1]], %arg5, %arg6] : memref<1x5x27x58xf32>
+  // CHECK: }
+  // CHECK: }
+  // CHECK: }
+
+  // CHECK: return [[RES]] : memref<1x5x27x58xf32>
+}
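For reference, the fixed output shape checked in both new tests follows from the no-padding, unit-stride case of the loop nest above: RH = H - KH + 1 = 32 - 6 + 1 = 27 and RW = W - KW + 1 = 64 - 7 + 1 = 58, and the channel dimension of the result equals the number of kernels M = 5, hence memref<1x5x27x58xf32> in both the single-group and the group = 3 case.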