Replace std.load/std.store by affine.load/affine.store (#180)
* Move to more recent LLVM ID (May 15)
* clang-format
* Bump cache version up
* Update readme
* Fix doc check
* Move to a newer commit id
* Update LoopToStandard -> SCFToStandard
* Change MLIRSideEffects to MLIRSideEffectInterfaces
* Add AffineScope trait to KrnlIterateOp
* [ElementWise] Load/Store op to AffineLoad/AffineStore op
* [Gemm, MatMul, Reduction, Softmax] Load/Store op to AffineLoad/AffineStore op
* [Concat] Load/Store op to AffineLoad/AffineStore op
* [Pad, PadConstantValuePad, Reshape, Transpose] Load/Store op to AffineLoad/AffineStore op
* [LSTM] Load/Store op to AffineLoad/AffineStore op
* [Conv, Norm, Pooling] Load/Store op to AffineLoad/AffineStore op
* Add affine-loop-fusion pass
* Use Load/Store for scalar
* Use Load/Store for scalar
* Fix lit tests
* Unknown dimensions for broadcasting ops
* Affine Load/Store for scalar memref
* clang-format

Co-authored-by: Gheorghe-Teodor Bercea <gt.bercea@gmail.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
This commit is contained in:
		
							parent
							
								
									2c8f5701bd
								
							
						
					
					
						commit
						7e05f371de
					
				| 
						 | 
				
			
			@ -619,20 +619,35 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
 | 
			
		|||
      for (auto arg : iterationBlock.getArguments())
 | 
			
		||||
        loopIVs.push_back(arg);
 | 
			
		||||
    }
 | 
			
		||||
    // Fold over operands for each of their scalar values
 | 
			
		||||
    // Fold over operands for each of their scalar values.
 | 
			
		||||
    Value accumulated, next;
 | 
			
		||||
    auto accumulatedLoopIVs = getLoopIVsForBroadcasting(
 | 
			
		||||
    // Obtain the first operand.
 | 
			
		||||
    std::vector<Value> accumulatedLoopIVs = getLoopIVsForBroadcasting(
 | 
			
		||||
        loc, rewriter, loopIVs, operands[0], broadcastedDimInfo[0]);
 | 
			
		||||
    accumulated = rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
 | 
			
		||||
    if (!hasAllConstantDimensions(memRefType))
 | 
			
		||||
      // In case of unknown dimensions, use std.load since
 | 
			
		||||
      // 'getLoopIVsForBroadcasting' does not support affine maps yet.
 | 
			
		||||
      accumulated =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
 | 
			
		||||
    else
 | 
			
		||||
      accumulated =
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operands[0], accumulatedLoopIVs);
 | 
			
		||||
    // Iterate over the remaining operands.
 | 
			
		||||
    for (unsigned i = 1; i < numArgs; i++) {
 | 
			
		||||
      auto nextLoopIVs = getLoopIVsForBroadcasting(
 | 
			
		||||
      std::vector<Value> nextLoopIVs = getLoopIVsForBroadcasting(
 | 
			
		||||
          loc, rewriter, loopIVs, operands[i], broadcastedDimInfo[i]);
 | 
			
		||||
      if (!hasAllConstantDimensions(memRefType))
 | 
			
		||||
        // In case of unknown dimensions, use std.load since
 | 
			
		||||
        // 'getLoopIVsForBroadcasting' does not support affine maps yet.
 | 
			
		||||
        next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
 | 
			
		||||
      else
 | 
			
		||||
        next = rewriter.create<AffineLoadOp>(loc, operands[i], nextLoopIVs);
 | 
			
		||||
      accumulated = emitScalarOpFor<ElementwiseVariadicOp>(
 | 
			
		||||
          rewriter, loc, op, memRefType.getElementType(), {accumulated, next});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Store result in the resulting array.
 | 
			
		||||
    rewriter.create<StoreOp>(loc, accumulated, alloc, loopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -156,23 +156,23 @@ struct ONNXGemmOpLowering : public ConversionPattern {
 | 
			
		|||
 | 
			
		||||
    // Initialize the output of A*B
 | 
			
		||||
    auto zero = emitConstantOp(rewriter, loc, memRefType.getElementType(), 0);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, zero, alloc, loopMNIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, zero, alloc, loopMNIVs);
 | 
			
		||||
 | 
			
		||||
    // Compute A*B
 | 
			
		||||
    auto matmulIterateOp = rewriter.create<KrnlIterateOp>(loc, reductionPack);
 | 
			
		||||
 | 
			
		||||
    // Compute beta*C, and add up to alpha*A*B (unidirectional broadcasting)
 | 
			
		||||
    auto loadedAB = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
 | 
			
		||||
    auto loadedAB = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
 | 
			
		||||
    auto alphaAB = rewriter.create<MulFOp>(loc, alpha, loadedAB);
 | 
			
		||||
    if (hasBias) {
 | 
			
		||||
      auto loopCIVs = getLoopIVsForBroadcasting(
 | 
			
		||||
          loc, rewriter, loopMNIVs, C, broadcastedDimInfo);
 | 
			
		||||
      auto loadedC = rewriter.create<LoadOp>(loc, C, loopCIVs);
 | 
			
		||||
      auto loadedC = rewriter.create<AffineLoadOp>(loc, C, loopCIVs);
 | 
			
		||||
      auto betaC = rewriter.create<MulFOp>(loc, beta, loadedC);
 | 
			
		||||
      auto Y = rewriter.create<AddFOp>(loc, alphaAB, betaC);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, Y, alloc, loopMNIVs);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, Y, alloc, loopMNIVs);
 | 
			
		||||
    } else {
 | 
			
		||||
      rewriter.create<StoreOp>(loc, alphaAB, alloc, loopMNIVs);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, alphaAB, alloc, loopMNIVs);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Insert instructions to do matrix multiplication: A*B
 | 
			
		||||
| 
						 | 
				
			
			@ -199,12 +199,12 @@ struct ONNXGemmOpLowering : public ConversionPattern {
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
    // Matmul computation
 | 
			
		||||
    auto loadedA = rewriter.create<LoadOp>(loc, A, loopAIVs);
 | 
			
		||||
    auto loadedB = rewriter.create<LoadOp>(loc, B, loopBIVs);
 | 
			
		||||
    auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
 | 
			
		||||
    auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopAIVs);
 | 
			
		||||
    auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBIVs);
 | 
			
		||||
    auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
 | 
			
		||||
    auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
 | 
			
		||||
    auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, accumulated, alloc, loopMNIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopMNIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -221,7 +221,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
 | 
			
		|||
      }
 | 
			
		||||
 | 
			
		||||
      // Fill the output with value 0.
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, alloc, loopBatchMNIVs);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, alloc, loopBatchMNIVs);
 | 
			
		||||
 | 
			
		||||
      //  Iterate along the reduction dimension.
 | 
			
		||||
      //  Use a value from A.
 | 
			
		||||
| 
						 | 
				
			
			@ -265,17 +265,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
 | 
			
		|||
          loopBatchKNIVs.emplace_back(loopMNIVs[0]);
 | 
			
		||||
      }
 | 
			
		||||
      // Matmul computation
 | 
			
		||||
      auto loadedA = rewriter.create<LoadOp>(loc, A, loopBatchMKIVs);
 | 
			
		||||
      auto loadedB = rewriter.create<LoadOp>(loc, B, loopBatchKNIVs);
 | 
			
		||||
      auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopBatchMNIVs);
 | 
			
		||||
      auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopBatchMKIVs);
 | 
			
		||||
      auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBatchKNIVs);
 | 
			
		||||
      auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopBatchMNIVs);
 | 
			
		||||
      if (elementType.isa<IntegerType>()) {
 | 
			
		||||
        auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
 | 
			
		||||
        auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
 | 
			
		||||
        rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
 | 
			
		||||
        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
 | 
			
		||||
      } else if (elementType.isa<FloatType>()) {
 | 
			
		||||
        auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
 | 
			
		||||
        auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
 | 
			
		||||
        rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
 | 
			
		||||
        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
 | 
			
		||||
      }
 | 
			
		||||
    } else if ((AShape.size() == 1) && (BShape.size() == 1)) {
 | 
			
		||||
      // Case 3:
 | 
			
		||||
| 
						 | 
				
			
			@ -283,7 +283,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
 | 
			
		|||
 | 
			
		||||
      // Fill the output with value 0.
 | 
			
		||||
      Value zeroIndex = rewriter.create<ConstantIndexOp>(loc, 0);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, alloc, zeroIndex);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, alloc, zeroIndex);
 | 
			
		||||
 | 
			
		||||
      //  Iterate along the reduction dimension.
 | 
			
		||||
      //  Use a value from A.
 | 
			
		||||
| 
						 | 
				
			
			@ -310,17 +310,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
 | 
			
		|||
      loopKIVs.emplace_back(reduceIterationBlock.getArgument(0));
 | 
			
		||||
 | 
			
		||||
      // Matmul computation
 | 
			
		||||
      auto loadedA = rewriter.create<LoadOp>(loc, A, loopKIVs);
 | 
			
		||||
      auto loadedB = rewriter.create<LoadOp>(loc, B, loopKIVs);
 | 
			
		||||
      auto loadedY = rewriter.create<LoadOp>(loc, alloc, zeroIndex);
 | 
			
		||||
      auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopKIVs);
 | 
			
		||||
      auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopKIVs);
 | 
			
		||||
      auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, zeroIndex);
 | 
			
		||||
      if (elementType.isa<IntegerType>()) {
 | 
			
		||||
        auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
 | 
			
		||||
        auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
 | 
			
		||||
        rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
 | 
			
		||||
        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
 | 
			
		||||
      } else if (elementType.isa<FloatType>()) {
 | 
			
		||||
        auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
 | 
			
		||||
        auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
 | 
			
		||||
        rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
 | 
			
		||||
        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      // No scalar matrix multiplication.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -212,7 +212,7 @@ struct ONNXReductionOpLowering : public ConversionPattern {
 | 
			
		|||
 | 
			
		||||
    Value identity =
 | 
			
		||||
        getIdentityValue<ONNXReductionOp>(rewriter, loc, elementOutType);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, identity, alloc, loopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, identity, alloc, loopIVs);
 | 
			
		||||
 | 
			
		||||
    // Define a Krnl loop to do reduction.
 | 
			
		||||
    rewriter.setInsertionPointAfter(iterateOpInit);
 | 
			
		||||
| 
						 | 
				
			
			@ -256,11 +256,11 @@ struct ONNXReductionOpLowering : public ConversionPattern {
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
    Value next, accumulated;
 | 
			
		||||
    next = rewriter.create<LoadOp>(loc, operands[0], inLoopIVs);
 | 
			
		||||
    accumulated = rewriter.create<LoadOp>(loc, alloc, outLoopIVs);
 | 
			
		||||
    next = rewriter.create<AffineLoadOp>(loc, operands[0], inLoopIVs);
 | 
			
		||||
    accumulated = rewriter.create<AffineLoadOp>(loc, alloc, outLoopIVs);
 | 
			
		||||
    accumulated = emitScalarOpFor<ONNXReductionOp>(
 | 
			
		||||
        rewriter, loc, op, memRefOutType.getElementType(), {accumulated, next});
 | 
			
		||||
    rewriter.create<StoreOp>(loc, accumulated, alloc, outLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, outLoopIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
    return success();
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -104,8 +104,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
 | 
			
		|||
        outerLoopIVs.push_back(arg);
 | 
			
		||||
 | 
			
		||||
      // Reset accumulators.
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, sumOp);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, negInfinity, maxOp);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
 | 
			
		||||
      rewriter.create<AffineStoreOp>(
 | 
			
		||||
          loc, negInfinity, maxOp, ArrayRef<Value>{});
 | 
			
		||||
 | 
			
		||||
      // Create an inner loop to compute max.
 | 
			
		||||
      maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
 | 
			
		||||
| 
						 | 
				
			
			@ -115,8 +116,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
 | 
			
		|||
      softmaxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
 | 
			
		||||
    } else {
 | 
			
		||||
      // Reset accumulators.
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, sumOp);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, negInfinity, maxOp);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
 | 
			
		||||
      rewriter.create<AffineStoreOp>(
 | 
			
		||||
          loc, negInfinity, maxOp, ArrayRef<Value>{});
 | 
			
		||||
 | 
			
		||||
      // Create an inner loop to compute max.
 | 
			
		||||
      maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
 | 
			
		||||
| 
						 | 
				
			
			@ -142,16 +144,16 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
 | 
			
		|||
      maxLoopIVs.push_back(arg);
 | 
			
		||||
 | 
			
		||||
    // Compute the max value.
 | 
			
		||||
    Value max = rewriter.create<LoadOp>(loc, maxOp);
 | 
			
		||||
    Value nextMax = rewriter.create<LoadOp>(loc, input, maxLoopIVs);
 | 
			
		||||
    Value max = rewriter.create<AffineLoadOp>(loc, maxOp);
 | 
			
		||||
    Value nextMax = rewriter.create<AffineLoadOp>(loc, input, maxLoopIVs);
 | 
			
		||||
    auto maxCond =
 | 
			
		||||
        rewriter.create<CmpFOp>(loc, CmpFPredicate::OGT, max, nextMax);
 | 
			
		||||
    max = rewriter.create<SelectOp>(loc, maxCond, max, nextMax);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, max, maxOp);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, max, maxOp, ArrayRef<Value>{});
 | 
			
		||||
 | 
			
		||||
    // Get the max.
 | 
			
		||||
    rewriter.setInsertionPoint(sumIterateOp);
 | 
			
		||||
    max = rewriter.create<LoadOp>(loc, maxOp);
 | 
			
		||||
    max = rewriter.create<AffineLoadOp>(loc, maxOp);
 | 
			
		||||
 | 
			
		||||
    // Insert instructions inside the sum loop.
 | 
			
		||||
    Block &sumIterationBlock = sumIterateOp.bodyRegion().front();
 | 
			
		||||
| 
						 | 
				
			
			@ -165,18 +167,18 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
 | 
			
		|||
      sumLoopIVs.push_back(arg);
 | 
			
		||||
 | 
			
		||||
    // Sum up values.
 | 
			
		||||
    Value sum = rewriter.create<LoadOp>(loc, sumOp);
 | 
			
		||||
    Value next = rewriter.create<LoadOp>(loc, input, sumLoopIVs);
 | 
			
		||||
    Value sum = rewriter.create<AffineLoadOp>(loc, sumOp);
 | 
			
		||||
    Value next = rewriter.create<AffineLoadOp>(loc, input, sumLoopIVs);
 | 
			
		||||
    Value sub = rewriter.create<SubFOp>(loc, next, max);
 | 
			
		||||
    Value exp = rewriter.create<ExpOp>(loc, sub);
 | 
			
		||||
    sum = rewriter.create<AddFOp>(loc, sum, exp);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, sum, sumOp);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, sum, sumOp, ArrayRef<Value>{});
 | 
			
		||||
    // Store intermediate values in the result to avoid recomputation.
 | 
			
		||||
    rewriter.create<StoreOp>(loc, exp, alloc, sumLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, exp, alloc, sumLoopIVs);
 | 
			
		||||
 | 
			
		||||
    // Get the sum.
 | 
			
		||||
    rewriter.setInsertionPoint(softmaxIterateOp);
 | 
			
		||||
    sum = rewriter.create<LoadOp>(loc, sumOp);
 | 
			
		||||
    sum = rewriter.create<AffineLoadOp>(loc, sumOp);
 | 
			
		||||
 | 
			
		||||
    // Insert instructions inside the softmax loop.
 | 
			
		||||
    Block &softmaxIterationBlock = softmaxIterateOp.bodyRegion().front();
 | 
			
		||||
| 
						 | 
				
			
			@ -190,9 +192,10 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
 | 
			
		|||
      softmaxLoopIVs.push_back(arg);
 | 
			
		||||
 | 
			
		||||
    // Compute softmax.
 | 
			
		||||
    Value expLoadedVal = rewriter.create<LoadOp>(loc, alloc, softmaxLoopIVs);
 | 
			
		||||
    Value expLoadedVal =
 | 
			
		||||
        rewriter.create<AffineLoadOp>(loc, alloc, softmaxLoopIVs);
 | 
			
		||||
    Value result = rewriter.create<DivFOp>(loc, expLoadedVal, sum);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, result, alloc, softmaxLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, result, alloc, softmaxLoopIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -129,10 +129,14 @@ struct ONNXConvOpLowering : public ConversionPattern {
 | 
			
		|||
      if (group > 1) {
 | 
			
		||||
        // Middle loop is over groups and third loop is over the
 | 
			
		||||
        // kernel identifiers in the current group.
 | 
			
		||||
        auto kernelsOffset = rewriter.create<MulIOp>(
 | 
			
		||||
            loc, outerLoops.getInductionVar(gIndex), kernelsPerGroupValue);
 | 
			
		||||
        kernel = rewriter.create<AddIOp>(
 | 
			
		||||
            loc, kernelsOffset, outerLoops.getInductionVar(mIndex));
 | 
			
		||||
        AffineMap kernelMap = AffineMap::get(2, 1,
 | 
			
		||||
            /*gIndex=*/rewriter.getAffineDimExpr(0) *
 | 
			
		||||
                    /*kernelsPerGroup=*/rewriter.getAffineSymbolExpr(0) +
 | 
			
		||||
                /*mIndex=*/rewriter.getAffineDimExpr(1));
 | 
			
		||||
        kernel = rewriter.create<AffineApplyOp>(loc, kernelMap,
 | 
			
		||||
            ArrayRef<Value>{/*gIndex=*/outerLoops.getInductionVar(gIndex),
 | 
			
		||||
                /*kernelsPerGroupValue=*/kernelsPerGroupValue,
 | 
			
		||||
                /*mIndex=*/outerLoops.getInductionVar(mIndex)});
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // 2.2 Define spatial loops
 | 
			
		||||
| 
						 | 
				
			
			@ -209,9 +213,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
 | 
			
		|||
                        /*subchannel=*/rewriter.getAffineSymbolExpr(0) +
 | 
			
		||||
                    /*c=*/rewriter.getAffineDimExpr(1));
 | 
			
		||||
            channelDepth = rewriter.create<AffineApplyOp>(loc, indexMap,
 | 
			
		||||
                ValueRange(
 | 
			
		||||
                ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
 | 
			
		||||
                        /*c=*/channelDepth, /*subchannel=*/subchannels}));
 | 
			
		||||
                    /*c=*/channelDepth, /*subchannel=*/subchannels});
 | 
			
		||||
          }
 | 
			
		||||
          dataIndices.emplace_back(channelDepth);
 | 
			
		||||
          // sX * rX + kX
 | 
			
		||||
| 
						 | 
				
			
			@ -231,8 +234,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
 | 
			
		|||
                /*sX=*/rewriter.getAffineDimExpr(0) * /*rX=*/stride +
 | 
			
		||||
                    /*kX=*/rewriter.getAffineDimExpr(1));
 | 
			
		||||
            Value outIV = rewriter.create<AffineApplyOp>(loc, indexMap,
 | 
			
		||||
                ValueRange(ArrayRef<Value>{spatialLoops.getInductionVar(i),
 | 
			
		||||
                    innerLoops.getInductionVar(i + 1)}));
 | 
			
		||||
                ArrayRef<Value>{spatialLoops.getInductionVar(i),
 | 
			
		||||
                    innerLoops.getInductionVar(i + 1)});
 | 
			
		||||
            dataIndices.emplace_back(outIV);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -79,10 +79,10 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
 | 
			
		|||
      loopCIVs.emplace_back(rewriter.create<ConstantIndexOp>(loc, 0));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    auto scaleVal = rewriter.create<LoadOp>(loc, scale, loopCIVs);
 | 
			
		||||
    auto biasVal = rewriter.create<LoadOp>(loc, bias, loopCIVs);
 | 
			
		||||
    auto meanVal = rewriter.create<LoadOp>(loc, mean, loopCIVs);
 | 
			
		||||
    auto varianceVal = rewriter.create<LoadOp>(loc, variance, loopCIVs);
 | 
			
		||||
    auto scaleVal = rewriter.create<AffineLoadOp>(loc, scale, loopCIVs);
 | 
			
		||||
    auto biasVal = rewriter.create<AffineLoadOp>(loc, bias, loopCIVs);
 | 
			
		||||
    auto meanVal = rewriter.create<AffineLoadOp>(loc, mean, loopCIVs);
 | 
			
		||||
    auto varianceVal = rewriter.create<AffineLoadOp>(loc, variance, loopCIVs);
 | 
			
		||||
 | 
			
		||||
    // Create a KrnlIterateOp along the other dimensions.
 | 
			
		||||
    SmallVector<int64_t, 4> axes;
 | 
			
		||||
| 
						 | 
				
			
			@ -118,7 +118,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
 | 
			
		|||
      loopIVs.emplace_back(args[0]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    auto xVal = rewriter.create<LoadOp>(loc, operand, loopIVs);
 | 
			
		||||
    auto xVal = rewriter.create<AffineLoadOp>(loc, operand, loopIVs);
 | 
			
		||||
    // normalize
 | 
			
		||||
    auto dividend = rewriter.create<SubFOp>(loc, xVal, meanVal);
 | 
			
		||||
    auto adjustedVarianceVal =
 | 
			
		||||
| 
						 | 
				
			
			@ -129,7 +129,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
 | 
			
		|||
    auto scaleNormVal = rewriter.create<MulFOp>(loc, scaleVal, normVal);
 | 
			
		||||
    auto shiftScaleNormVal =
 | 
			
		||||
        rewriter.create<AddFOp>(loc, scaleNormVal, biasVal);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -100,7 +100,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
 | 
			
		|||
    ArrayRef<Value> poolDimValues) {
 | 
			
		||||
  // AveragePool's result type is FloatType, so it's safe to use DivFOp, SubFOp.
 | 
			
		||||
  bool countIncludePad = getCountIncludePad<ONNXAveragePoolOp>(poolOp);
 | 
			
		||||
  Value numerator = rewriter.create<LoadOp>(loc, alloc, resultIndices);
 | 
			
		||||
  Value numerator = rewriter.create<AffineLoadOp>(loc, alloc, resultIndices);
 | 
			
		||||
  Value denominator;
 | 
			
		||||
  if (countIncludePad) {
 | 
			
		||||
    int64_t kernelSize = 1;
 | 
			
		||||
| 
						 | 
				
			
			@ -120,7 +120,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
 | 
			
		|||
 | 
			
		||||
  Value average = rewriter.create<DivFOp>(loc, numerator, denominator);
 | 
			
		||||
 | 
			
		||||
  rewriter.create<StoreOp>(loc, average, alloc, resultIndices);
 | 
			
		||||
  rewriter.create<AffineStoreOp>(loc, average, alloc, resultIndices);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//===----------------------------------------------------------------------===//
 | 
			
		||||
| 
						 | 
				
			
			@ -167,9 +167,7 @@ Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter,
 | 
			
		|||
              dilations.empty() ? 1 : dilations[spatialIndex]));
 | 
			
		||||
 | 
			
		||||
      // Apply the affine map.
 | 
			
		||||
      Value dimVal =
 | 
			
		||||
          rewriter.create<AffineApplyOp>(loc, dimMap, ValueRange(dimArgs));
 | 
			
		||||
 | 
			
		||||
      Value dimVal = rewriter.create<AffineApplyOp>(loc, dimMap, dimArgs);
 | 
			
		||||
      allocOperands.emplace_back(dimVal);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
| 
						 | 
				
			
			@ -346,7 +344,7 @@ struct ONNXPoolOpLowering : public ConversionPattern {
 | 
			
		|||
        outputIndices.emplace_back(outputLoops.getInductionVar(i));
 | 
			
		||||
 | 
			
		||||
      // 2.1 Emit: output[n][c][ho][wo] = identity
 | 
			
		||||
      rewriter.create<StoreOp>(loc, identity, alloc, outputIndices);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, identity, alloc, outputIndices);
 | 
			
		||||
 | 
			
		||||
      // 2.2 Emit affine maps which express the lower and upper bounds for the
 | 
			
		||||
      // pooling window's dimensions.
 | 
			
		||||
| 
						 | 
				
			
			@ -441,11 +439,11 @@ struct ONNXPoolOpLowering : public ConversionPattern {
 | 
			
		|||
      { // Construct poolStartValues and poolDimValues.
 | 
			
		||||
        for (int i = 0; i < kernelShape.size(); ++i) {
 | 
			
		||||
          Value startIndex = rewriter.create<AffineMaxOp>(
 | 
			
		||||
              loc, poolStartMap, ValueRange(IVsAndConstants[i]));
 | 
			
		||||
              loc, poolStartMap, IVsAndConstants[i]);
 | 
			
		||||
          poolStartValues.emplace_back(startIndex);
 | 
			
		||||
 | 
			
		||||
          Value endIndex = rewriter.create<AffineMinOp>(
 | 
			
		||||
              loc, poolEndMap, ValueRange(IVsAndConstants[i]));
 | 
			
		||||
          Value endIndex =
 | 
			
		||||
              rewriter.create<AffineMinOp>(loc, poolEndMap, IVsAndConstants[i]);
 | 
			
		||||
 | 
			
		||||
          Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex);
 | 
			
		||||
          if (isDilated) {
 | 
			
		||||
| 
						 | 
				
			
			@ -514,10 +512,10 @@ struct ONNXPoolOpLowering : public ConversionPattern {
 | 
			
		|||
        Value loadInput =
 | 
			
		||||
            rewriter.create<LoadOp>(loc, inputOperand, inputIndices);
 | 
			
		||||
        Value loadPartialOutput =
 | 
			
		||||
            rewriter.create<LoadOp>(loc, alloc, outputIndices);
 | 
			
		||||
            rewriter.create<AffineLoadOp>(loc, alloc, outputIndices);
 | 
			
		||||
        Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op,
 | 
			
		||||
            outputElementType, {loadPartialOutput, loadInput});
 | 
			
		||||
        rewriter.create<StoreOp>(loc, output, alloc, outputIndices);
 | 
			
		||||
        rewriter.create<AffineStoreOp>(loc, output, alloc, outputIndices);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // 2.5 Post-processing for the pooling window, e.g. taking average.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -222,13 +222,15 @@ LstmState allocAndInitializeStates<ONNXLSTMOp, LstmState>(
 | 
			
		|||
 | 
			
		||||
    Value hiddenVal = zero;
 | 
			
		||||
    if (!isNoneType(operandAdaptor.initial_h()))
 | 
			
		||||
      hiddenVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_h(), IVs);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, hiddenVal, state.ht, IVs);
 | 
			
		||||
      hiddenVal =
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_h(), IVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, hiddenVal, state.ht, IVs);
 | 
			
		||||
 | 
			
		||||
    Value cellVal = zero;
 | 
			
		||||
    if (!isNoneType(operandAdaptor.initial_c()))
 | 
			
		||||
      cellVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_c(), IVs);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, cellVal, state.ct, IVs);
 | 
			
		||||
      cellVal =
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_c(), IVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, cellVal, state.ct, IVs);
 | 
			
		||||
  }
 | 
			
		||||
  rewriter.restoreInsertionPoint(ipInitializationLoops);
 | 
			
		||||
  return state;
 | 
			
		||||
| 
						 | 
				
			
			@ -320,8 +322,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
        for (unsigned i = 0; i < 4; ++i) {
 | 
			
		||||
          Value wHiddenIV =
 | 
			
		||||
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
 | 
			
		||||
                  ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
 | 
			
		||||
                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
 | 
			
		||||
                  std::vector<Value>{/*iv=*/hiddenIV,
 | 
			
		||||
                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal});
 | 
			
		||||
          wbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, wHiddenIV});
 | 
			
		||||
        }
 | 
			
		||||
        // Rb[iofc]
 | 
			
		||||
| 
						 | 
				
			
			@ -329,8 +331,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
          SmallVector<Value, 4> rbIVs;
 | 
			
		||||
          Value rHiddenIV =
 | 
			
		||||
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
 | 
			
		||||
                  ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
 | 
			
		||||
                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
 | 
			
		||||
                  std::vector<Value>{/*iv=*/hiddenIV,
 | 
			
		||||
                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal});
 | 
			
		||||
          rbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, rHiddenIV});
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
| 
						 | 
				
			
			@ -339,17 +341,16 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
      if (hasPeepholes) {
 | 
			
		||||
        for (unsigned i = 0; i < 3; ++i) {
 | 
			
		||||
          SmallVector<Value, 4> pIVs;
 | 
			
		||||
          Value pHiddenIV =
 | 
			
		||||
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
 | 
			
		||||
                  ValueRange(std::vector<Value>{
 | 
			
		||||
                      hiddenIV, constantIndices[i], hiddenDimVal}));
 | 
			
		||||
          Value pHiddenIV = rewriter.create<AffineApplyOp>(loc,
 | 
			
		||||
              accessByOffsetMap,
 | 
			
		||||
              std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
 | 
			
		||||
          pIOFIVs.emplace_back(SmallVector<Value, 2>{directionIV, pHiddenIV});
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Value loadH = rewriter.create<LoadOp>(loc, state.ht, hIVs);
 | 
			
		||||
    Value loadC = rewriter.create<LoadOp>(loc, state.ct, cIVs);
 | 
			
		||||
    Value loadH = rewriter.create<AffineLoadOp>(loc, state.ht, hIVs);
 | 
			
		||||
    Value loadC = rewriter.create<AffineLoadOp>(loc, state.ct, cIVs);
 | 
			
		||||
 | 
			
		||||
    // Emit instructions for matrix multiplications:
 | 
			
		||||
    //   Xt*(Wi^T), Xt*(Wo^T), Xt*(Wf^t), Xt*(Wc^T)
 | 
			
		||||
| 
						 | 
				
			
			@ -361,9 +362,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
    MemRefType scalarMemRefType = MemRefType::get({}, elementType, {}, 0);
 | 
			
		||||
    for (unsigned i = 0; i < 4; ++i) {
 | 
			
		||||
      Value xwAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, xwAlloc);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, xwAlloc, ArrayRef<Value>{});
 | 
			
		||||
      Value hrAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, zero, hrAlloc);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, zero, hrAlloc, ArrayRef<Value>{});
 | 
			
		||||
      xwIOFC.emplace_back(xwAlloc);
 | 
			
		||||
      hrIOFC.emplace_back(hrAlloc);
 | 
			
		||||
    }
 | 
			
		||||
| 
						 | 
				
			
			@ -390,10 +391,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
        // R[iofc] :: [num_directions, 4*hidden_size, input_size]
 | 
			
		||||
        for (unsigned i = 0; i < 4; ++i) {
 | 
			
		||||
          SmallVector<Value, 4> wIVs, rIVs;
 | 
			
		||||
          Value wHiddenIV =
 | 
			
		||||
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
 | 
			
		||||
                  ValueRange(std::vector<Value>{
 | 
			
		||||
                      hiddenIV, constantIndices[i], hiddenDimVal}));
 | 
			
		||||
          Value wHiddenIV = rewriter.create<AffineApplyOp>(loc,
 | 
			
		||||
              accessByOffsetMap,
 | 
			
		||||
              std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
 | 
			
		||||
 | 
			
		||||
          wIVs = {directionIV, wHiddenIV, reductionIV};
 | 
			
		||||
          wIOFCIVs.emplace_back(wIVs);
 | 
			
		||||
| 
						 | 
				
			
			@ -402,77 +402,80 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
          rIOFCIVs.emplace_back(rIVs);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Value loadX = rewriter.create<LoadOp>(loc, operandAdaptor.X(), xIVs);
 | 
			
		||||
        Value loadX =
 | 
			
		||||
            rewriter.create<AffineLoadOp>(loc, operandAdaptor.X(), xIVs);
 | 
			
		||||
        for (unsigned i = 0; i < 4; ++i) {
 | 
			
		||||
          // Xt * Wiofc
 | 
			
		||||
          Value loadW =
 | 
			
		||||
              rewriter.create<LoadOp>(loc, operandAdaptor.W(), wIOFCIVs[i]);
 | 
			
		||||
          Value loadW = rewriter.create<AffineLoadOp>(
 | 
			
		||||
              loc, operandAdaptor.W(), wIOFCIVs[i]);
 | 
			
		||||
          Value xwVal = rewriter.create<MulFOp>(loc, loadX, loadW);
 | 
			
		||||
          Value loadXW = rewriter.create<LoadOp>(loc, xwIOFC[i]);
 | 
			
		||||
          Value loadXW = rewriter.create<AffineLoadOp>(loc, xwIOFC[i]);
 | 
			
		||||
          Value nextXW = rewriter.create<AddFOp>(loc, loadXW, xwVal);
 | 
			
		||||
          rewriter.create<StoreOp>(loc, nextXW, xwIOFC[i]);
 | 
			
		||||
          rewriter.create<AffineStoreOp>(
 | 
			
		||||
              loc, nextXW, xwIOFC[i], ArrayRef<Value>{});
 | 
			
		||||
          // Ht-1 * Riofc
 | 
			
		||||
          Value loadR =
 | 
			
		||||
              rewriter.create<LoadOp>(loc, operandAdaptor.R(), rIOFCIVs[i]);
 | 
			
		||||
          Value loadR = rewriter.create<AffineLoadOp>(
 | 
			
		||||
              loc, operandAdaptor.R(), rIOFCIVs[i]);
 | 
			
		||||
          Value hrVal = rewriter.create<MulFOp>(loc, loadH, loadR);
 | 
			
		||||
          Value loadHR = rewriter.create<LoadOp>(loc, hrIOFC[i]);
 | 
			
		||||
          Value loadHR = rewriter.create<AffineLoadOp>(loc, hrIOFC[i]);
 | 
			
		||||
          Value nextHR = rewriter.create<AddFOp>(loc, loadHR, hrVal);
 | 
			
		||||
          rewriter.create<StoreOp>(loc, nextHR, hrIOFC[i]);
 | 
			
		||||
          rewriter.create<AffineStoreOp>(
 | 
			
		||||
              loc, nextHR, hrIOFC[i], ArrayRef<Value>{});
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      rewriter.restoreInsertionPoint(ipReductionLoops);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
 | 
			
		||||
    Value loadXWI = rewriter.create<LoadOp>(loc, xwIOFC[0]);
 | 
			
		||||
    Value loadHRI = rewriter.create<LoadOp>(loc, hrIOFC[0]);
 | 
			
		||||
    Value loadXWI = rewriter.create<AffineLoadOp>(loc, xwIOFC[0]);
 | 
			
		||||
    Value loadHRI = rewriter.create<AffineLoadOp>(loc, hrIOFC[0]);
 | 
			
		||||
    Value it = rewriter.create<AddFOp>(loc, loadXWI, loadHRI);
 | 
			
		||||
    if (hasPeepholes) {
 | 
			
		||||
      Value loadP =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
 | 
			
		||||
      Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
 | 
			
		||||
      it = rewriter.create<AddFOp>(loc, it, PC);
 | 
			
		||||
    }
 | 
			
		||||
    if (hasBiasForInput) {
 | 
			
		||||
      Value loadWB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
 | 
			
		||||
      it = rewriter.create<AddFOp>(loc, it, loadWB);
 | 
			
		||||
      Value loadRB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
 | 
			
		||||
      it = rewriter.create<AddFOp>(loc, it, loadRB);
 | 
			
		||||
    }
 | 
			
		||||
    it = applyActivation(rewriter, loc, activationPack.f, it);
 | 
			
		||||
 | 
			
		||||
    // ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
 | 
			
		||||
    Value loadXWF = rewriter.create<LoadOp>(loc, xwIOFC[2]);
 | 
			
		||||
    Value loadHRF = rewriter.create<LoadOp>(loc, hrIOFC[2]);
 | 
			
		||||
    Value loadXWF = rewriter.create<AffineLoadOp>(loc, xwIOFC[2]);
 | 
			
		||||
    Value loadHRF = rewriter.create<AffineLoadOp>(loc, hrIOFC[2]);
 | 
			
		||||
    Value ft = rewriter.create<AddFOp>(loc, loadXWF, loadHRF);
 | 
			
		||||
    if (hasPeepholes) {
 | 
			
		||||
      Value loadP =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
 | 
			
		||||
      Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
 | 
			
		||||
      ft = rewriter.create<AddFOp>(loc, ft, PC);
 | 
			
		||||
    }
 | 
			
		||||
    if (hasBiasForInput) {
 | 
			
		||||
      Value loadWB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
 | 
			
		||||
      ft = rewriter.create<AddFOp>(loc, ft, loadWB);
 | 
			
		||||
      Value loadRB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
 | 
			
		||||
      ft = rewriter.create<AddFOp>(loc, ft, loadRB);
 | 
			
		||||
    }
 | 
			
		||||
    ft = applyActivation(rewriter, loc, activationPack.f, ft);
 | 
			
		||||
 | 
			
		||||
    // ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
 | 
			
		||||
    Value loadXWC = rewriter.create<LoadOp>(loc, xwIOFC[3]);
 | 
			
		||||
    Value loadHRC = rewriter.create<LoadOp>(loc, hrIOFC[3]);
 | 
			
		||||
    Value loadXWC = rewriter.create<AffineLoadOp>(loc, xwIOFC[3]);
 | 
			
		||||
    Value loadHRC = rewriter.create<AffineLoadOp>(loc, hrIOFC[3]);
 | 
			
		||||
    Value ct = rewriter.create<AddFOp>(loc, loadXWC, loadHRC);
 | 
			
		||||
    if (hasBiasForInput) {
 | 
			
		||||
      Value loadWB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
 | 
			
		||||
      ct = rewriter.create<AddFOp>(loc, ct, loadWB);
 | 
			
		||||
      Value loadRB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
 | 
			
		||||
      ct = rewriter.create<AddFOp>(loc, ct, loadRB);
 | 
			
		||||
    }
 | 
			
		||||
    ct = applyActivation(rewriter, loc, activationPack.g, ct);
 | 
			
		||||
| 
						 | 
				
			
			@ -481,24 +484,24 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
    Value FtCt1 = rewriter.create<MulFOp>(loc, ft, loadC);
 | 
			
		||||
    Value itct = rewriter.create<MulFOp>(loc, it, ct);
 | 
			
		||||
    Value Ct = rewriter.create<AddFOp>(loc, FtCt1, itct);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, Ct, state.ct, cIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, Ct, state.ct, cIVs);
 | 
			
		||||
 | 
			
		||||
    // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
 | 
			
		||||
    Value loadXWO = rewriter.create<LoadOp>(loc, xwIOFC[1]);
 | 
			
		||||
    Value loadHRO = rewriter.create<LoadOp>(loc, hrIOFC[1]);
 | 
			
		||||
    Value loadXWO = rewriter.create<AffineLoadOp>(loc, xwIOFC[1]);
 | 
			
		||||
    Value loadHRO = rewriter.create<AffineLoadOp>(loc, hrIOFC[1]);
 | 
			
		||||
    Value ot = rewriter.create<AddFOp>(loc, loadXWO, loadHRO);
 | 
			
		||||
    if (hasPeepholes) {
 | 
			
		||||
      Value loadP =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
 | 
			
		||||
      Value PC = rewriter.create<MulFOp>(loc, loadP, Ct);
 | 
			
		||||
      ot = rewriter.create<AddFOp>(loc, ot, PC);
 | 
			
		||||
    }
 | 
			
		||||
    if (hasBiasForInput) {
 | 
			
		||||
      Value loadWB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
 | 
			
		||||
      ot = rewriter.create<AddFOp>(loc, ot, loadWB);
 | 
			
		||||
      Value loadRB =
 | 
			
		||||
          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
 | 
			
		||||
      ot = rewriter.create<AddFOp>(loc, ot, loadRB);
 | 
			
		||||
    }
 | 
			
		||||
    ot = applyActivation(rewriter, loc, activationPack.f, ot);
 | 
			
		||||
| 
						 | 
				
			
			@ -506,12 +509,12 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
 | 
			
		|||
    // Ht = ot (.) h(Ct)
 | 
			
		||||
    Value hCt = applyActivation(rewriter, loc, activationPack.h, Ct);
 | 
			
		||||
    Value Ht = rewriter.create<MulFOp>(loc, ot, hCt);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, Ht, state.ht, hIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, Ht, state.ht, hIVs);
 | 
			
		||||
 | 
			
		||||
    // Store the current Ht if required.
 | 
			
		||||
    if (!isNoneType(state.allH)) {
 | 
			
		||||
      SmallVector<Value, 4> allHIVs{sequenceIV, directionIV, batchIV, hiddenIV};
 | 
			
		||||
      rewriter.create<StoreOp>(loc, Ht, state.allH, allHIVs);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, Ht, state.allH, allHIVs);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Deallocate the temporary results of matrix multiplications.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,7 +28,7 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
 | 
			
		|||
  MemRefType scalarMemRefType =
 | 
			
		||||
      MemRefType::get({}, scalarOperand.getType(), {}, 0);
 | 
			
		||||
  Value alloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
 | 
			
		||||
  rewriter.create<StoreOp>(loc, scalarOperand, alloc);
 | 
			
		||||
  rewriter.create<AffineStoreOp>(loc, scalarOperand, alloc, ArrayRef<Value>{});
 | 
			
		||||
 | 
			
		||||
  std::vector<mlir::NamedAttribute> attributes;
 | 
			
		||||
  if (activation.alpha) {
 | 
			
		||||
| 
						 | 
				
			
			@ -68,6 +68,6 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
 | 
			
		|||
  else
 | 
			
		||||
    llvm_unreachable("Unsupported activation");
 | 
			
		||||
 | 
			
		||||
  Value result = rewriter.create<LoadOp>(loc, res);
 | 
			
		||||
  Value result = rewriter.create<AffineLoadOp>(loc, res);
 | 
			
		||||
  return result;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -126,9 +126,9 @@ struct ONNXRNNOpLowering : public ConversionPattern {
 | 
			
		|||
            rewriter.getIndexType(), (direction == REVERSE) ? 0 : 1);
 | 
			
		||||
        Value reverseSequenceIV =
 | 
			
		||||
            rewriter.create<AffineApplyOp>(loc, reverseIVMap,
 | 
			
		||||
                ValueRange(std::vector<Value>{sequenceLoops.getInductionVar(0),
 | 
			
		||||
                std::vector<Value>{sequenceLoops.getInductionVar(0),
 | 
			
		||||
                    emitConstantOp(rewriter, loc, rewriter.getIndexType(),
 | 
			
		||||
                        sequenceDimSize)}));
 | 
			
		||||
                        sequenceDimSize)});
 | 
			
		||||
        // Emit calculation for one RNN step.
 | 
			
		||||
        calculateState<RNNOp, S, A>(rewriter, loc, operandAdaptor, state,
 | 
			
		||||
            activationReverse, directionIV, reverseSequenceIV);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -59,15 +59,18 @@ struct ONNXConcatOpLowering : public ConversionPattern {
 | 
			
		|||
        if (r != axis || writeOffset == 0) {
 | 
			
		||||
          writeIndices.emplace_back(inputLoops.getInductionVar(r));
 | 
			
		||||
        } else {
 | 
			
		||||
          auto indexWithOffset = rewriter.create<AddIOp>(loc,
 | 
			
		||||
              rewriter.create<ConstantIndexOp>(loc, writeOffset),
 | 
			
		||||
              inputLoops.getInductionVar(r));
 | 
			
		||||
          AffineMap indexWithOffsetMap =
 | 
			
		||||
              AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + writeOffset);
 | 
			
		||||
          Value indexWithOffset =
 | 
			
		||||
              rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
 | 
			
		||||
                  ArrayRef<Value>{inputLoops.getInductionVar(r)});
 | 
			
		||||
          writeIndices.emplace_back(indexWithOffset);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      // Insert copy.
 | 
			
		||||
      auto loadData = rewriter.create<LoadOp>(loc, operands[i], readIndices);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, loadData, alloc, writeIndices);
 | 
			
		||||
      auto loadData =
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operands[i], readIndices);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, loadData, alloc, writeIndices);
 | 
			
		||||
      // Increment offset
 | 
			
		||||
      writeOffset += currShape[axis];
 | 
			
		||||
    }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -88,16 +88,17 @@ struct ONNXPadOpLowering : public ConversionPattern {
 | 
			
		|||
      if (pads[i] == 0) {
 | 
			
		||||
        outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
 | 
			
		||||
      } else {
 | 
			
		||||
        auto outIV = rewriter.create<AddIOp>(loc,
 | 
			
		||||
            rewriter.create<ConstantIndexOp>(loc, pads[i]),
 | 
			
		||||
            valueLoops.getInductionVar(i));
 | 
			
		||||
        AffineMap indexWithOffsetMap =
 | 
			
		||||
            AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pads[i]);
 | 
			
		||||
        Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
 | 
			
		||||
            ArrayRef<Value>{valueLoops.getInductionVar(i)});
 | 
			
		||||
        outLoopIVs.emplace_back(outIV);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    auto originValue =
 | 
			
		||||
        rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, originValue, alloc, outLoopIVs);
 | 
			
		||||
        rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, originValue, alloc, outLoopIVs);
 | 
			
		||||
    rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
 | 
			
		||||
 | 
			
		||||
    SmallVector<Value, 4> outLoopIVs1;
 | 
			
		||||
| 
						 | 
				
			
			@ -105,7 +106,7 @@ struct ONNXPadOpLowering : public ConversionPattern {
 | 
			
		|||
      outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
 | 
			
		||||
 | 
			
		||||
    auto paddingValue = rewriter.create<ConstantOp>(loc, valueAttr);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, paddingValue, alloc, outLoopIVs1);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, paddingValue, alloc, outLoopIVs1);
 | 
			
		||||
 | 
			
		||||
    // Replace the original op with the generated code.
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -77,15 +77,17 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
 | 
			
		|||
      if (pad_begin[i] == 0) {
 | 
			
		||||
        outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
 | 
			
		||||
      } else {
 | 
			
		||||
        auto outIV = rewriter.create<AddIOp>(loc,
 | 
			
		||||
            rewriter.create<ConstantIndexOp>(loc, pad_begin[i]),
 | 
			
		||||
            valueLoops.getInductionVar(i));
 | 
			
		||||
        AffineMap indexWithOffsetMap =
 | 
			
		||||
            AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pad_begin[i]);
 | 
			
		||||
        Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
 | 
			
		||||
            ArrayRef<Value>{valueLoops.getInductionVar(i)});
 | 
			
		||||
        outLoopIVs.emplace_back(outIV);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    auto inVal = rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
 | 
			
		||||
    auto inVal =
 | 
			
		||||
        rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
 | 
			
		||||
    rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
 | 
			
		||||
 | 
			
		||||
    SmallVector<Value, 4> outLoopIVs1;
 | 
			
		||||
| 
						 | 
				
			
			@ -93,7 +95,7 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
 | 
			
		|||
      outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
 | 
			
		||||
 | 
			
		||||
    auto inVal1 = rewriter.create<ConstantOp>(loc, constantValAttr);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, inVal1, alloc, outLoopIVs1);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, inVal1, alloc, outLoopIVs1);
 | 
			
		||||
 | 
			
		||||
    // Replace the original op with the generated code.
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -64,7 +64,8 @@ struct ONNXReshapeOpLowering : public ConversionPattern {
 | 
			
		|||
      for (int i = 0; i < memRefShape.size(); ++i) {
 | 
			
		||||
        Value index = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
 | 
			
		||||
        // Load index from array of indices.
 | 
			
		||||
        Value loadedVal = rewriter.create<LoadOp>(loc, operands[1], index);
 | 
			
		||||
        Value loadedVal =
 | 
			
		||||
            rewriter.create<AffineLoadOp>(loc, operands[1], index);
 | 
			
		||||
        // If a dimension is zero, the actual dimension value is taken from the
 | 
			
		||||
        // input tensor.
 | 
			
		||||
        //
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -92,8 +92,9 @@ struct ONNXSplitOpLowering : public ConversionPattern {
 | 
			
		|||
        writeIndices.emplace_back(outputLoops.getInductionVar(r));
 | 
			
		||||
      }
 | 
			
		||||
      // Insert copy.
 | 
			
		||||
      auto loadData = rewriter.create<LoadOp>(loc, operands[0], readIndices);
 | 
			
		||||
      rewriter.create<StoreOp>(loc, loadData, allocs[i], writeIndices);
 | 
			
		||||
      auto loadData =
 | 
			
		||||
          rewriter.create<AffineLoadOp>(loc, operands[0], readIndices);
 | 
			
		||||
      rewriter.create<AffineStoreOp>(loc, loadData, allocs[i], writeIndices);
 | 
			
		||||
    }
 | 
			
		||||
    rewriter.replaceOp(op, allocs);
 | 
			
		||||
    return success();
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -80,8 +80,8 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
 | 
			
		|||
    for (int i = 0; i < iterationBlock.getArguments().size(); ++i)
 | 
			
		||||
      outLoopIVs.emplace_back(iterationBlock.getArguments()[perm[i]]);
 | 
			
		||||
 | 
			
		||||
    auto inVal = rewriter.create<LoadOp>(loc, data, inLoopIVs);
 | 
			
		||||
    rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
 | 
			
		||||
    auto inVal = rewriter.create<AffineLoadOp>(loc, data, inLoopIVs);
 | 
			
		||||
    rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
 | 
			
		||||
 | 
			
		||||
    rewriter.replaceOp(op, alloc);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,10 +14,10 @@ func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
 | 
			
		|||
  // CHECK: krnl.define_loops
 | 
			
		||||
  // CHECK: krnl.optimize_loops
 | 
			
		||||
  // CHECK: krnl.iterate
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: krnl.define_loops
 | 
			
		||||
  // CHECK: krnl.optimize_loops
 | 
			
		||||
  // CHECK: krnl.iterate
 | 
			
		||||
| 
						 | 
				
			
			@ -43,26 +43,26 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3
 | 
			
		|||
  // CHECK: krnl.define_loops
 | 
			
		||||
  // CHECK: krnl.optimize_loops
 | 
			
		||||
  // CHECK: krnl.iterate
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: krnl.define_loops
 | 
			
		||||
  // CHECK: krnl.optimize_loops
 | 
			
		||||
  // CHECK: krnl.iterate
 | 
			
		||||
  // CHECK: [[LOAD3:%.+]] = load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD4:%.+]] = load %arg1[%arg4, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD5:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD4:%.+]] = affine.load %arg1[%arg4, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[MULF1:%.+]] = mulf [[LOAD3]], [[LOAD4]] : f32
 | 
			
		||||
  // CHECK: [[ADDF2:%.+]] = addf [[LOAD5]], [[MULF1]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: krnl.define_loops
 | 
			
		||||
  // CHECK: krnl.optimize_loops
 | 
			
		||||
  // CHECK: krnl.iterate
 | 
			
		||||
  // CHECK: [[LOAD6:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD7:%.+]] = load %arg1[%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[LOAD7:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: [[ADDF3:%.+]] = addf [[LOAD6]], [[LOAD7]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
 | 
			
		||||
  // CHECK: dealloc [[MEMPOOL1]] : memref<400xi8>
 | 
			
		||||
  // CHECK: dealloc [[MEMPOOL0]] : memref<800xi8>
 | 
			
		||||
  // CHECK: return [[RES]] : memref<10x20xf32>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| 
						 | 
				
			
			@ -16,10 +16,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Second Add
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -27,10 +27,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -55,10 +55,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Second Mul
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -66,10 +66,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -94,10 +94,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Second Div
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -105,10 +105,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -133,10 +133,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Second Sub
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -144,10 +144,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -172,10 +172,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Second And
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -183,10 +183,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
 | 
			
		||||
| 
						 | 
				
			
			@ -211,10 +211,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Second Or
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -222,10 +222,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
 | 
			
		||||
| 
						 | 
				
			
			@ -250,10 +250,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Second Xor
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -261,10 +261,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
 | 
			
		||||
  // CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
  // CHECK: affine.store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
 | 
			
		||||
| 
						 | 
				
			
			@ -585,10 +585,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Second Sum
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -596,10 +596,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -624,11 +624,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  
 | 
			
		||||
  /// Second Max
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -636,11 +636,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			@ -665,11 +665,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  
 | 
			
		||||
  /// Second Min
 | 
			
		||||
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
 | 
			
		||||
| 
						 | 
				
			
			@ -677,11 +677,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
 | 
			
		|||
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
 | 
			
		||||
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
 | 
			
		||||
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
 | 
			
		||||
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  // CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
 | 
			
		||||
  
 | 
			
		||||
  /// Dealloc of first result.
 | 
			
		||||
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue