Replace std.load/std.store by affine.load/affine.store (#180)

* Move to more recent LLVM ID (May 15)
* clang-format
* Bump cache version up
* Update readme
* Fix doc check
* Move to a newer commit id
* Update LoopToStandard -> SCFToStandard
* Change MLIRSideEffects to MLIRSideEffectInterfaces
* Add AffineScope trait to KrnlIterateOp
* [ElementWise] Load/Store op to AffineLoad/AffineStore op
* [Gemm, MatMul, Reduction, Softmax] Load/Store op to AffineLoad/AffineStore op
* [Concat] Load/Store op to AffineLoad/AffineStore op
* [Pad, PadConstantValuePad, Reshape, Transpose] Load/Store op to AffineLoad/AffineStore op
* [LSTM] Load/Store op to AffineLoad/AffineStore op
* [Conv, Norm, Pooling] Load/Store op to AffineLoad/AffineStore op
* Add affine-loop-fusion pass
* Use Load/Store for scalar
* Use Load/Store for scalar
* Fix lit tests
* Unknown dimensions for broadcasting ops
* Affine Load/Store for scalar memref
* clang-format

Co-authored-by: Gheorghe-Teodor Bercea <gt.bercea@gmail.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
2020-07-05 17:20:21 +09:00 · 2020-07-05 17:20:21 +09:00 · 7e05f371de
parent 2c8f5701bd
commit 7e05f371de
20 changed files with 494 additions and 469 deletions
--- a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp
@ -619,20 +619,35 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
      for (auto arg : iterationBlock.getArguments())
        loopIVs.push_back(arg);
    }
-    // Fold over operands for each of their scalar values
+    // Fold over operands for each of their scalar values.
    Value accumulated, next;
-    auto accumulatedLoopIVs = getLoopIVsForBroadcasting(
+    // Obtain the first operand.
+    std::vector<Value> accumulatedLoopIVs = getLoopIVsForBroadcasting(
        loc, rewriter, loopIVs, operands[0], broadcastedDimInfo[0]);
-    accumulated = rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
+    if (!hasAllConstantDimensions(memRefType))
+      // In case of unknown dimensions, use std.load since
+      // 'getLoopIVsForBroadcasting' has not supported affine map so far.
+      accumulated =
+          rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
+    else
+      accumulated =
+          rewriter.create<AffineLoadOp>(loc, operands[0], accumulatedLoopIVs);
+    // Iterate over the remaining operands.
    for (unsigned i = 1; i < numArgs; i++) {
-      auto nextLoopIVs = getLoopIVsForBroadcasting(
+      std::vector<Value> nextLoopIVs = getLoopIVsForBroadcasting(
          loc, rewriter, loopIVs, operands[i], broadcastedDimInfo[i]);
-      next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
+      if (!hasAllConstantDimensions(memRefType))
+        // In case of unknown dimensions, use std.load since
+        // 'getLoopIVsForBroadcasting' has not supported affine map so far.
+        next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
+      else
+        next = rewriter.create<AffineLoadOp>(loc, operands[i], nextLoopIVs);
      accumulated = emitScalarOpFor<ElementwiseVariadicOp>(
          rewriter, loc, op, memRefType.getElementType(), {accumulated, next});
    }
+
    // Store result in the resulting array.
-    rewriter.create<StoreOp>(loc, accumulated, alloc, loopIVs);
+    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopIVs);

    rewriter.replaceOp(op, alloc);

--- a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp
@ -156,23 +156,23 @@ struct ONNXGemmOpLowering : public ConversionPattern {

    // Initialize the output of A*B
    auto zero = emitConstantOp(rewriter, loc, memRefType.getElementType(), 0);
-    rewriter.create<StoreOp>(loc, zero, alloc, loopMNIVs);
+    rewriter.create<AffineStoreOp>(loc, zero, alloc, loopMNIVs);

    // Compute A*B
    auto matmulIterateOp = rewriter.create<KrnlIterateOp>(loc, reductionPack);

    // Compute beta*C, and add up to alpha*A*B (unidirectional broadcasting)
-    auto loadedAB = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
+    auto loadedAB = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
    auto alphaAB = rewriter.create<MulFOp>(loc, alpha, loadedAB);
    if (hasBias) {
      auto loopCIVs = getLoopIVsForBroadcasting(
          loc, rewriter, loopMNIVs, C, broadcastedDimInfo);
-      auto loadedC = rewriter.create<LoadOp>(loc, C, loopCIVs);
+      auto loadedC = rewriter.create<AffineLoadOp>(loc, C, loopCIVs);
      auto betaC = rewriter.create<MulFOp>(loc, beta, loadedC);
      auto Y = rewriter.create<AddFOp>(loc, alphaAB, betaC);
-      rewriter.create<StoreOp>(loc, Y, alloc, loopMNIVs);
+      rewriter.create<AffineStoreOp>(loc, Y, alloc, loopMNIVs);
    } else {
-      rewriter.create<StoreOp>(loc, alphaAB, alloc, loopMNIVs);
+      rewriter.create<AffineStoreOp>(loc, alphaAB, alloc, loopMNIVs);
    }

    // Insert instructions to do matrix multiplication: A*B
@ -199,12 +199,12 @@ struct ONNXGemmOpLowering : public ConversionPattern {
    }

    // Matmul computation
-    auto loadedA = rewriter.create<LoadOp>(loc, A, loopAIVs);
-    auto loadedB = rewriter.create<LoadOp>(loc, B, loopBIVs);
-    auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
+    auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopAIVs);
+    auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBIVs);
+    auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
    auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
    auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
-    rewriter.create<StoreOp>(loc, accumulated, alloc, loopMNIVs);
+    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopMNIVs);

    rewriter.replaceOp(op, alloc);

--- a/src/Conversion/ONNXToKrnl/Math/MatMul.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/MatMul.cpp
@ -221,7 +221,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
      }

      // Fill the output with value 0.
-      rewriter.create<StoreOp>(loc, zero, alloc, loopBatchMNIVs);
+      rewriter.create<AffineStoreOp>(loc, zero, alloc, loopBatchMNIVs);

      //  Iterate along the reduction dimension.
      //  Use a value from A.
@ -265,17 +265,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
          loopBatchKNIVs.emplace_back(loopMNIVs[0]);
      }
      // Matmul computation
-      auto loadedA = rewriter.create<LoadOp>(loc, A, loopBatchMKIVs);
-      auto loadedB = rewriter.create<LoadOp>(loc, B, loopBatchKNIVs);
-      auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopBatchMNIVs);
+      auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopBatchMKIVs);
+      auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBatchKNIVs);
+      auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopBatchMNIVs);
      if (elementType.isa<IntegerType>()) {
        auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
        auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
-        rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
+        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
      } else if (elementType.isa<FloatType>()) {
        auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
        auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
-        rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
+        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
      }
    } else if ((AShape.size() == 1) && (BShape.size() == 1)) {
      // Case 3:
@ -283,7 +283,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {

      // Fill the output with value 0.
      Value zeroIndex = rewriter.create<ConstantIndexOp>(loc, 0);
-      rewriter.create<StoreOp>(loc, zero, alloc, zeroIndex);
+      rewriter.create<AffineStoreOp>(loc, zero, alloc, zeroIndex);

      //  Iterate along the reduction dimension.
      //  Use a value from A.
@ -310,17 +310,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
      loopKIVs.emplace_back(reduceIterationBlock.getArgument(0));

      // Matmul computation
-      auto loadedA = rewriter.create<LoadOp>(loc, A, loopKIVs);
-      auto loadedB = rewriter.create<LoadOp>(loc, B, loopKIVs);
-      auto loadedY = rewriter.create<LoadOp>(loc, alloc, zeroIndex);
+      auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopKIVs);
+      auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopKIVs);
+      auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, zeroIndex);
      if (elementType.isa<IntegerType>()) {
        auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
        auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
-        rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
+        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
      } else if (elementType.isa<FloatType>()) {
        auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
        auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
-        rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
+        rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
      }
    } else {
      // No scalar matrix multiplication.
--- a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
@ -212,7 +212,7 @@ struct ONNXReductionOpLowering : public ConversionPattern {

    Value identity =
        getIdentityValue<ONNXReductionOp>(rewriter, loc, elementOutType);
-    rewriter.create<StoreOp>(loc, identity, alloc, loopIVs);
+    rewriter.create<AffineStoreOp>(loc, identity, alloc, loopIVs);

    // Define an Krnl loop to do reduction.
    rewriter.setInsertionPointAfter(iterateOpInit);
@ -256,11 +256,11 @@ struct ONNXReductionOpLowering : public ConversionPattern {
    }

    Value next, accumulated;
-    next = rewriter.create<LoadOp>(loc, operands[0], inLoopIVs);
-    accumulated = rewriter.create<LoadOp>(loc, alloc, outLoopIVs);
+    next = rewriter.create<AffineLoadOp>(loc, operands[0], inLoopIVs);
+    accumulated = rewriter.create<AffineLoadOp>(loc, alloc, outLoopIVs);
    accumulated = emitScalarOpFor<ONNXReductionOp>(
        rewriter, loc, op, memRefOutType.getElementType(), {accumulated, next});
-    rewriter.create<StoreOp>(loc, accumulated, alloc, outLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, accumulated, alloc, outLoopIVs);

    rewriter.replaceOp(op, alloc);
    return success();
--- a/src/Conversion/ONNXToKrnl/Math/Softmax.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Softmax.cpp
@ -104,8 +104,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
        outerLoopIVs.push_back(arg);

      // Reset accumulators.
-      rewriter.create<StoreOp>(loc, zero, sumOp);
-      rewriter.create<StoreOp>(loc, negInfinity, maxOp);
+      rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
+      rewriter.create<AffineStoreOp>(
+          loc, negInfinity, maxOp, ArrayRef<Value>{});

      // Create an inner loop to compute max.
      maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
@ -115,8 +116,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
      softmaxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
    } else {
      // Reset accumulators.
-      rewriter.create<StoreOp>(loc, zero, sumOp);
-      rewriter.create<StoreOp>(loc, negInfinity, maxOp);
+      rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
+      rewriter.create<AffineStoreOp>(
+          loc, negInfinity, maxOp, ArrayRef<Value>{});

      // Create an inner loop to compute max.
      maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
@ -142,16 +144,16 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
      maxLoopIVs.push_back(arg);

    // Compute the max value.
-    Value max = rewriter.create<LoadOp>(loc, maxOp);
-    Value nextMax = rewriter.create<LoadOp>(loc, input, maxLoopIVs);
+    Value max = rewriter.create<AffineLoadOp>(loc, maxOp);
+    Value nextMax = rewriter.create<AffineLoadOp>(loc, input, maxLoopIVs);
    auto maxCond =
        rewriter.create<CmpFOp>(loc, CmpFPredicate::OGT, max, nextMax);
    max = rewriter.create<SelectOp>(loc, maxCond, max, nextMax);
-    rewriter.create<StoreOp>(loc, max, maxOp);
+    rewriter.create<AffineStoreOp>(loc, max, maxOp, ArrayRef<Value>{});

    // Get the max.
    rewriter.setInsertionPoint(sumIterateOp);
-    max = rewriter.create<LoadOp>(loc, maxOp);
+    max = rewriter.create<AffineLoadOp>(loc, maxOp);

    // Insert instructions inside the sum loop.
    Block &sumIterationBlock = sumIterateOp.bodyRegion().front();
@ -165,18 +167,18 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
      sumLoopIVs.push_back(arg);

    // Sum up values.
-    Value sum = rewriter.create<LoadOp>(loc, sumOp);
-    Value next = rewriter.create<LoadOp>(loc, input, sumLoopIVs);
+    Value sum = rewriter.create<AffineLoadOp>(loc, sumOp);
+    Value next = rewriter.create<AffineLoadOp>(loc, input, sumLoopIVs);
    Value sub = rewriter.create<SubFOp>(loc, next, max);
    Value exp = rewriter.create<ExpOp>(loc, sub);
    sum = rewriter.create<AddFOp>(loc, sum, exp);
-    rewriter.create<StoreOp>(loc, sum, sumOp);
+    rewriter.create<AffineStoreOp>(loc, sum, sumOp, ArrayRef<Value>{});
    // Store intermediate values in the result to avoid recomputation.
-    rewriter.create<StoreOp>(loc, exp, alloc, sumLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, exp, alloc, sumLoopIVs);

    // Get the sum.
    rewriter.setInsertionPoint(softmaxIterateOp);
-    sum = rewriter.create<LoadOp>(loc, sumOp);
+    sum = rewriter.create<AffineLoadOp>(loc, sumOp);

    // Insert instructions inside the softmax loop.
    Block &softmaxIterationBlock = softmaxIterateOp.bodyRegion().front();
@ -190,9 +192,10 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
      softmaxLoopIVs.push_back(arg);

    // Compute softmax.
-    Value expLoadedVal = rewriter.create<LoadOp>(loc, alloc, softmaxLoopIVs);
+    Value expLoadedVal =
+        rewriter.create<AffineLoadOp>(loc, alloc, softmaxLoopIVs);
    Value result = rewriter.create<DivFOp>(loc, expLoadedVal, sum);
-    rewriter.create<StoreOp>(loc, result, alloc, softmaxLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, result, alloc, softmaxLoopIVs);

    rewriter.replaceOp(op, alloc);

--- a/src/Conversion/ONNXToKrnl/NN/Conv.cpp
+++ b/src/Conversion/ONNXToKrnl/NN/Conv.cpp
@ -129,10 +129,14 @@ struct ONNXConvOpLowering : public ConversionPattern {
      if (group > 1) {
        // Middle loop is over groups and third loop is over the
        // kernel identifiers in the current group.
-        auto kernelsOffset = rewriter.create<MulIOp>(
-            loc, outerLoops.getInductionVar(gIndex), kernelsPerGroupValue);
-        kernel = rewriter.create<AddIOp>(
-            loc, kernelsOffset, outerLoops.getInductionVar(mIndex));
+        AffineMap kernelMap = AffineMap::get(2, 1,
+            /*gIndex=*/rewriter.getAffineDimExpr(0) *
+                    /*kernelsPerGroup=*/rewriter.getAffineSymbolExpr(0) +
+                /*mIndex=*/rewriter.getAffineDimExpr(1));
+        kernel = rewriter.create<AffineApplyOp>(loc, kernelMap,
+            ArrayRef<Value>{/*gIndex=*/outerLoops.getInductionVar(gIndex),
+                /*mIndex=*/outerLoops.getInductionVar(mIndex), // d1
+                /*kernelsPerGroupValue=*/kernelsPerGroupValue}); // s0 is last
      }

      // 2.2 Define spatial loops
@ -209,9 +213,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
                        /*subchannel=*/rewriter.getAffineSymbolExpr(0) +
                    /*c=*/rewriter.getAffineDimExpr(1));
            channelDepth = rewriter.create<AffineApplyOp>(loc, indexMap,
-                ValueRange(
-                    ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
-                        /*c=*/channelDepth, /*subchannel=*/subchannels}));
+                ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
+                    /*c=*/channelDepth, /*subchannel=*/subchannels});
          }
          dataIndices.emplace_back(channelDepth);
          // sX * rX + kX
@ -231,8 +234,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
                /*sX=*/rewriter.getAffineDimExpr(0) * /*rX=*/stride +
                    /*kX=*/rewriter.getAffineDimExpr(1));
            Value outIV = rewriter.create<AffineApplyOp>(loc, indexMap,
-                ValueRange(ArrayRef<Value>{spatialLoops.getInductionVar(i),
-                    innerLoops.getInductionVar(i + 1)}));
+                ArrayRef<Value>{spatialLoops.getInductionVar(i),
+                    innerLoops.getInductionVar(i + 1)});
            dataIndices.emplace_back(outIV);
          }

--- a/src/Conversion/ONNXToKrnl/NN/Normalization.cpp
+++ b/src/Conversion/ONNXToKrnl/NN/Normalization.cpp
@ -79,10 +79,10 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
      loopCIVs.emplace_back(rewriter.create<ConstantIndexOp>(loc, 0));
    }

-    auto scaleVal = rewriter.create<LoadOp>(loc, scale, loopCIVs);
-    auto biasVal = rewriter.create<LoadOp>(loc, bias, loopCIVs);
-    auto meanVal = rewriter.create<LoadOp>(loc, mean, loopCIVs);
-    auto varianceVal = rewriter.create<LoadOp>(loc, variance, loopCIVs);
+    auto scaleVal = rewriter.create<AffineLoadOp>(loc, scale, loopCIVs);
+    auto biasVal = rewriter.create<AffineLoadOp>(loc, bias, loopCIVs);
+    auto meanVal = rewriter.create<AffineLoadOp>(loc, mean, loopCIVs);
+    auto varianceVal = rewriter.create<AffineLoadOp>(loc, variance, loopCIVs);

    // Create a KrnlIterateOp along the other dimensions.
    SmallVector<int64_t, 4> axes;
@ -118,7 +118,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
      loopIVs.emplace_back(args[0]);
    }

-    auto xVal = rewriter.create<LoadOp>(loc, operand, loopIVs);
+    auto xVal = rewriter.create<AffineLoadOp>(loc, operand, loopIVs);
    // normalize
    auto dividend = rewriter.create<SubFOp>(loc, xVal, meanVal);
    auto adjustedVarianceVal =
@ -129,7 +129,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
    auto scaleNormVal = rewriter.create<MulFOp>(loc, scaleVal, normVal);
    auto shiftScaleNormVal =
        rewriter.create<AddFOp>(loc, scaleNormVal, biasVal);
-    rewriter.create<StoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
+    rewriter.create<AffineStoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);

    rewriter.replaceOp(op, alloc);

--- a/src/Conversion/ONNXToKrnl/NN/Pooling.cpp
+++ b/src/Conversion/ONNXToKrnl/NN/Pooling.cpp
@ -100,7 +100,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
    ArrayRef<Value> poolDimValues) {
  // AveragePool's result type is FloatType, so it's safe to use DivFOp, SubFOp.
  bool countIncludePad = getCountIncludePad<ONNXAveragePoolOp>(poolOp);
-  Value numerator = rewriter.create<LoadOp>(loc, alloc, resultIndices);
+  Value numerator = rewriter.create<AffineLoadOp>(loc, alloc, resultIndices);
  Value denominator;
  if (countIncludePad) {
    int64_t kernelSize = 1;
@ -120,7 +120,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(

  Value average = rewriter.create<DivFOp>(loc, numerator, denominator);

-  rewriter.create<StoreOp>(loc, average, alloc, resultIndices);
+  rewriter.create<AffineStoreOp>(loc, average, alloc, resultIndices);
 }

 //===----------------------------------------------------------------------===//
@ -167,9 +167,7 @@ Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter,
              dilations.empty() ? 1 : dilations[spatialIndex]));

      // Apply the affine map.
-      Value dimVal =
-          rewriter.create<AffineApplyOp>(loc, dimMap, ValueRange(dimArgs));
-
+      Value dimVal = rewriter.create<AffineApplyOp>(loc, dimMap, dimArgs);
      allocOperands.emplace_back(dimVal);
    }
  }
@ -346,7 +344,7 @@ struct ONNXPoolOpLowering : public ConversionPattern {
        outputIndices.emplace_back(outputLoops.getInductionVar(i));

      // 2.1 Emit: output[n][c][ho][wo] = identity
-      rewriter.create<StoreOp>(loc, identity, alloc, outputIndices);
+      rewriter.create<AffineStoreOp>(loc, identity, alloc, outputIndices);

      // 2.2 Emit affine maps which express the lower and upper bounds for the
      // pooling window's dimensions.
@ -441,11 +439,11 @@ struct ONNXPoolOpLowering : public ConversionPattern {
      { // Construct poolStartValues and poolDimValues.
        for (int i = 0; i < kernelShape.size(); ++i) {
          Value startIndex = rewriter.create<AffineMaxOp>(
-              loc, poolStartMap, ValueRange(IVsAndConstants[i]));
+              loc, poolStartMap, IVsAndConstants[i]);
          poolStartValues.emplace_back(startIndex);

-          Value endIndex = rewriter.create<AffineMinOp>(
-              loc, poolEndMap, ValueRange(IVsAndConstants[i]));
+          Value endIndex =
+              rewriter.create<AffineMinOp>(loc, poolEndMap, IVsAndConstants[i]);

          Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex);
          if (isDilated) {
@ -514,10 +512,10 @@ struct ONNXPoolOpLowering : public ConversionPattern {
        Value loadInput =
            rewriter.create<LoadOp>(loc, inputOperand, inputIndices);
        Value loadPartialOutput =
-            rewriter.create<LoadOp>(loc, alloc, outputIndices);
+            rewriter.create<AffineLoadOp>(loc, alloc, outputIndices);
        Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op,
            outputElementType, {loadPartialOutput, loadInput});
-        rewriter.create<StoreOp>(loc, output, alloc, outputIndices);
+        rewriter.create<AffineStoreOp>(loc, output, alloc, outputIndices);
      }

      // 2.5 Post-processing for the pooling window, e.g. taking average.
--- a/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp
+++ b/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp
@ -222,13 +222,15 @@ LstmState allocAndInitializeStates<ONNXLSTMOp, LstmState>(

    Value hiddenVal = zero;
    if (!isNoneType(operandAdaptor.initial_h()))
-      hiddenVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_h(), IVs);
-    rewriter.create<StoreOp>(loc, hiddenVal, state.ht, IVs);
+      hiddenVal =
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_h(), IVs);
+    rewriter.create<AffineStoreOp>(loc, hiddenVal, state.ht, IVs);

    Value cellVal = zero;
    if (!isNoneType(operandAdaptor.initial_c()))
-      cellVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_c(), IVs);
-    rewriter.create<StoreOp>(loc, cellVal, state.ct, IVs);
+      cellVal =
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_c(), IVs);
+    rewriter.create<AffineStoreOp>(loc, cellVal, state.ct, IVs);
  }
  rewriter.restoreInsertionPoint(ipInitializationLoops);
  return state;
@ -320,8 +322,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
        for (unsigned i = 0; i < 4; ++i) {
          Value wHiddenIV =
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
-                  ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
-                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
+                  std::vector<Value>{/*iv=*/hiddenIV,
+                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal});
          wbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, wHiddenIV});
        }
        // Rb[iofc]
@ -329,8 +331,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
          SmallVector<Value, 4> rbIVs;
          Value rHiddenIV =
              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
-                  ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
-                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
+                  std::vector<Value>{/*iv=*/hiddenIV,
+                      /*index=*/constantIndices[i], /*size=*/hiddenDimVal});
          rbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, rHiddenIV});
        }
      }
@ -339,17 +341,16 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
      if (hasPeepholes) {
        for (unsigned i = 0; i < 3; ++i) {
          SmallVector<Value, 4> pIVs;
-          Value pHiddenIV =
-              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
-                  ValueRange(std::vector<Value>{
-                      hiddenIV, constantIndices[i], hiddenDimVal}));
+          Value pHiddenIV = rewriter.create<AffineApplyOp>(loc,
+              accessByOffsetMap,
+              std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
          pIOFIVs.emplace_back(SmallVector<Value, 2>{directionIV, pHiddenIV});
        }
      }
    }

-    Value loadH = rewriter.create<LoadOp>(loc, state.ht, hIVs);
-    Value loadC = rewriter.create<LoadOp>(loc, state.ct, cIVs);
+    Value loadH = rewriter.create<AffineLoadOp>(loc, state.ht, hIVs);
+    Value loadC = rewriter.create<AffineLoadOp>(loc, state.ct, cIVs);

    // Emit instructions for matrix multiplications:
    //   Xt*(Wi^T), Xt*(Wo^T), Xt*(Wf^t), Xt*(Wc^T)
@ -361,9 +362,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
    MemRefType scalarMemRefType = MemRefType::get({}, elementType, {}, 0);
    for (unsigned i = 0; i < 4; ++i) {
      Value xwAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
-      rewriter.create<StoreOp>(loc, zero, xwAlloc);
+      rewriter.create<AffineStoreOp>(loc, zero, xwAlloc, ArrayRef<Value>{});
      Value hrAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
-      rewriter.create<StoreOp>(loc, zero, hrAlloc);
+      rewriter.create<AffineStoreOp>(loc, zero, hrAlloc, ArrayRef<Value>{});
      xwIOFC.emplace_back(xwAlloc);
      hrIOFC.emplace_back(hrAlloc);
    }
@ -390,10 +391,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
        // R[iofc] :: [num_directions, 4*hidden_size, input_size]
        for (unsigned i = 0; i < 4; ++i) {
          SmallVector<Value, 4> wIVs, rIVs;
-          Value wHiddenIV =
-              rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
-                  ValueRange(std::vector<Value>{
-                      hiddenIV, constantIndices[i], hiddenDimVal}));
+          Value wHiddenIV = rewriter.create<AffineApplyOp>(loc,
+              accessByOffsetMap,
+              std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});

          wIVs = {directionIV, wHiddenIV, reductionIV};
          wIOFCIVs.emplace_back(wIVs);
@ -402,77 +402,80 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
          rIOFCIVs.emplace_back(rIVs);
        }

-        Value loadX = rewriter.create<LoadOp>(loc, operandAdaptor.X(), xIVs);
+        Value loadX =
+            rewriter.create<AffineLoadOp>(loc, operandAdaptor.X(), xIVs);
        for (unsigned i = 0; i < 4; ++i) {
          // Xt * Wiofc
-          Value loadW =
-              rewriter.create<LoadOp>(loc, operandAdaptor.W(), wIOFCIVs[i]);
+          Value loadW = rewriter.create<AffineLoadOp>(
+              loc, operandAdaptor.W(), wIOFCIVs[i]);
          Value xwVal = rewriter.create<MulFOp>(loc, loadX, loadW);
-          Value loadXW = rewriter.create<LoadOp>(loc, xwIOFC[i]);
+          Value loadXW = rewriter.create<AffineLoadOp>(loc, xwIOFC[i]);
          Value nextXW = rewriter.create<AddFOp>(loc, loadXW, xwVal);
-          rewriter.create<StoreOp>(loc, nextXW, xwIOFC[i]);
+          rewriter.create<AffineStoreOp>(
+              loc, nextXW, xwIOFC[i], ArrayRef<Value>{});
          // Ht-1 * Riofc
-          Value loadR =
-              rewriter.create<LoadOp>(loc, operandAdaptor.R(), rIOFCIVs[i]);
+          Value loadR = rewriter.create<AffineLoadOp>(
+              loc, operandAdaptor.R(), rIOFCIVs[i]);
          Value hrVal = rewriter.create<MulFOp>(loc, loadH, loadR);
-          Value loadHR = rewriter.create<LoadOp>(loc, hrIOFC[i]);
+          Value loadHR = rewriter.create<AffineLoadOp>(loc, hrIOFC[i]);
          Value nextHR = rewriter.create<AddFOp>(loc, loadHR, hrVal);
-          rewriter.create<StoreOp>(loc, nextHR, hrIOFC[i]);
+          rewriter.create<AffineStoreOp>(
+              loc, nextHR, hrIOFC[i], ArrayRef<Value>{});
        }
      }
      rewriter.restoreInsertionPoint(ipReductionLoops);
    }

    // it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
-    Value loadXWI = rewriter.create<LoadOp>(loc, xwIOFC[0]);
-    Value loadHRI = rewriter.create<LoadOp>(loc, hrIOFC[0]);
+    Value loadXWI = rewriter.create<AffineLoadOp>(loc, xwIOFC[0]);
+    Value loadHRI = rewriter.create<AffineLoadOp>(loc, hrIOFC[0]);
    Value it = rewriter.create<AddFOp>(loc, loadXWI, loadHRI);
    if (hasPeepholes) {
      Value loadP =
-          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
      Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
      it = rewriter.create<AddFOp>(loc, it, PC);
    }
    if (hasBiasForInput) {
      Value loadWB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
      it = rewriter.create<AddFOp>(loc, it, loadWB);
      Value loadRB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
      it = rewriter.create<AddFOp>(loc, it, loadRB);
    }
    it = applyActivation(rewriter, loc, activationPack.f, it);

    // ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
-    Value loadXWF = rewriter.create<LoadOp>(loc, xwIOFC[2]);
-    Value loadHRF = rewriter.create<LoadOp>(loc, hrIOFC[2]);
+    Value loadXWF = rewriter.create<AffineLoadOp>(loc, xwIOFC[2]);
+    Value loadHRF = rewriter.create<AffineLoadOp>(loc, hrIOFC[2]);
    Value ft = rewriter.create<AddFOp>(loc, loadXWF, loadHRF);
    if (hasPeepholes) {
      Value loadP =
-          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
      Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
      ft = rewriter.create<AddFOp>(loc, ft, PC);
    }
    if (hasBiasForInput) {
      Value loadWB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
      ft = rewriter.create<AddFOp>(loc, ft, loadWB);
      Value loadRB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
      ft = rewriter.create<AddFOp>(loc, ft, loadRB);
    }
    ft = applyActivation(rewriter, loc, activationPack.f, ft);

    // ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
-    Value loadXWC = rewriter.create<LoadOp>(loc, xwIOFC[3]);
-    Value loadHRC = rewriter.create<LoadOp>(loc, hrIOFC[3]);
+    Value loadXWC = rewriter.create<AffineLoadOp>(loc, xwIOFC[3]);
+    Value loadHRC = rewriter.create<AffineLoadOp>(loc, hrIOFC[3]);
    Value ct = rewriter.create<AddFOp>(loc, loadXWC, loadHRC);
    if (hasBiasForInput) {
      Value loadWB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
      ct = rewriter.create<AddFOp>(loc, ct, loadWB);
      Value loadRB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
      ct = rewriter.create<AddFOp>(loc, ct, loadRB);
    }
    ct = applyActivation(rewriter, loc, activationPack.g, ct);
@ -481,24 +484,24 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
    Value FtCt1 = rewriter.create<MulFOp>(loc, ft, loadC);
    Value itct = rewriter.create<MulFOp>(loc, it, ct);
    Value Ct = rewriter.create<AddFOp>(loc, FtCt1, itct);
-    rewriter.create<StoreOp>(loc, Ct, state.ct, cIVs);
+    rewriter.create<AffineStoreOp>(loc, Ct, state.ct, cIVs);

    // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
-    Value loadXWO = rewriter.create<LoadOp>(loc, xwIOFC[1]);
-    Value loadHRO = rewriter.create<LoadOp>(loc, hrIOFC[1]);
+    Value loadXWO = rewriter.create<AffineLoadOp>(loc, xwIOFC[1]);
+    Value loadHRO = rewriter.create<AffineLoadOp>(loc, hrIOFC[1]);
    Value ot = rewriter.create<AddFOp>(loc, loadXWO, loadHRO);
    if (hasPeepholes) {
      Value loadP =
-          rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
      Value PC = rewriter.create<MulFOp>(loc, loadP, Ct);
      ot = rewriter.create<AddFOp>(loc, ot, PC);
    }
    if (hasBiasForInput) {
      Value loadWB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
      ot = rewriter.create<AddFOp>(loc, ot, loadWB);
      Value loadRB =
-          rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
+          rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
      ot = rewriter.create<AddFOp>(loc, ot, loadRB);
    }
    ot = applyActivation(rewriter, loc, activationPack.f, ot);
@ -506,12 +509,12 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
    // Ht = ot (.) h(Ct)
    Value hCt = applyActivation(rewriter, loc, activationPack.h, Ct);
    Value Ht = rewriter.create<MulFOp>(loc, ot, hCt);
-    rewriter.create<StoreOp>(loc, Ht, state.ht, hIVs);
+    rewriter.create<AffineStoreOp>(loc, Ht, state.ht, hIVs);

    // Store the current Ht if required.
    if (!isNoneType(state.allH)) {
      SmallVector<Value, 4> allHIVs{sequenceIV, directionIV, batchIV, hiddenIV};
-      rewriter.create<StoreOp>(loc, Ht, state.allH, allHIVs);
+      rewriter.create<AffineStoreOp>(loc, Ht, state.allH, allHIVs);
    }

    // Deallocate the temporary results of matrix multiplications.
--- a/src/Conversion/ONNXToKrnl/RNN/RNNBase.cpp
+++ b/src/Conversion/ONNXToKrnl/RNN/RNNBase.cpp
@ -28,7 +28,7 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
  MemRefType scalarMemRefType =
      MemRefType::get({}, scalarOperand.getType(), {}, 0);
  Value alloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
-  rewriter.create<StoreOp>(loc, scalarOperand, alloc);
+  rewriter.create<AffineStoreOp>(loc, scalarOperand, alloc, ArrayRef<Value>{});

  std::vector<mlir::NamedAttribute> attributes;
  if (activation.alpha) {
@ -68,6 +68,6 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
  else
    llvm_unreachable("Unsupported activation");

-  Value result = rewriter.create<LoadOp>(loc, res);
+  Value result = rewriter.create<AffineLoadOp>(loc, res);
  return result;
 }
--- a/src/Conversion/ONNXToKrnl/RNN/RNNBase.hpp
+++ b/src/Conversion/ONNXToKrnl/RNN/RNNBase.hpp
@ -126,9 +126,9 @@ struct ONNXRNNOpLowering : public ConversionPattern {
            rewriter.getIndexType(), (direction == REVERSE) ? 0 : 1);
        Value reverseSequenceIV =
            rewriter.create<AffineApplyOp>(loc, reverseIVMap,
-                ValueRange(std::vector<Value>{sequenceLoops.getInductionVar(0),
+                std::vector<Value>{sequenceLoops.getInductionVar(0),
                    emitConstantOp(rewriter, loc, rewriter.getIndexType(),
-                        sequenceDimSize)}));
+                        sequenceDimSize)});
        // Emit calculation for one RNN step.
        calculateState<RNNOp, S, A>(rewriter, loc, operandAdaptor, state,
            activationReverse, directionIV, reverseSequenceIV);
--- a/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp
@ -59,15 +59,18 @@ struct ONNXConcatOpLowering : public ConversionPattern {
        if (r != axis || writeOffset == 0) {
          writeIndices.emplace_back(inputLoops.getInductionVar(r));
        } else {
-          auto indexWithOffset = rewriter.create<AddIOp>(loc,
-              rewriter.create<ConstantIndexOp>(loc, writeOffset),
-              inputLoops.getInductionVar(r));
+          AffineMap indexWithOffsetMap =
+              AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + writeOffset);
+          Value indexWithOffset =
+              rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
+                  ArrayRef<Value>{inputLoops.getInductionVar(r)});
          writeIndices.emplace_back(indexWithOffset);
        }
      }
      // Insert copy.
-      auto loadData = rewriter.create<LoadOp>(loc, operands[i], readIndices);
-      rewriter.create<StoreOp>(loc, loadData, alloc, writeIndices);
+      auto loadData =
+          rewriter.create<AffineLoadOp>(loc, operands[i], readIndices);
+      rewriter.create<AffineStoreOp>(loc, loadData, alloc, writeIndices);
      // Increment offset
      writeOffset += currShape[axis];
    }
--- a/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp
@ -88,16 +88,17 @@ struct ONNXPadOpLowering : public ConversionPattern {
      if (pads[i] == 0) {
        outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
      } else {
-        auto outIV = rewriter.create<AddIOp>(loc,
-            rewriter.create<ConstantIndexOp>(loc, pads[i]),
-            valueLoops.getInductionVar(i));
+        AffineMap indexWithOffsetMap =
+            AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pads[i]);
+        Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
+            ArrayRef<Value>{valueLoops.getInductionVar(i)});
        outLoopIVs.emplace_back(outIV);
      }
    }

    auto originValue =
-        rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
-    rewriter.create<StoreOp>(loc, originValue, alloc, outLoopIVs);
+        rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, originValue, alloc, outLoopIVs);
    rewriter.setInsertionPointToStart(padLoops.getIterateBlock());

    SmallVector<Value, 4> outLoopIVs1;
@ -105,7 +106,7 @@ struct ONNXPadOpLowering : public ConversionPattern {
      outLoopIVs1.emplace_back(padLoops.getInductionVar(i));

    auto paddingValue = rewriter.create<ConstantOp>(loc, valueAttr);
-    rewriter.create<StoreOp>(loc, paddingValue, alloc, outLoopIVs1);
+    rewriter.create<AffineStoreOp>(loc, paddingValue, alloc, outLoopIVs1);

    // Replace the original op with the generated code.
    rewriter.replaceOp(op, alloc);
--- a/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp
@ -77,15 +77,17 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
      if (pad_begin[i] == 0) {
        outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
      } else {
-        auto outIV = rewriter.create<AddIOp>(loc,
-            rewriter.create<ConstantIndexOp>(loc, pad_begin[i]),
-            valueLoops.getInductionVar(i));
+        AffineMap indexWithOffsetMap =
+            AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pad_begin[i]);
+        Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
+            ArrayRef<Value>{valueLoops.getInductionVar(i)});
        outLoopIVs.emplace_back(outIV);
      }
    }

-    auto inVal = rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
-    rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
+    auto inVal =
+        rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
    rewriter.setInsertionPointToStart(padLoops.getIterateBlock());

    SmallVector<Value, 4> outLoopIVs1;
@ -93,7 +95,7 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
      outLoopIVs1.emplace_back(padLoops.getInductionVar(i));

    auto inVal1 = rewriter.create<ConstantOp>(loc, constantValAttr);
-    rewriter.create<StoreOp>(loc, inVal1, alloc, outLoopIVs1);
+    rewriter.create<AffineStoreOp>(loc, inVal1, alloc, outLoopIVs1);

    // Replace the original op with the generated code.
    rewriter.replaceOp(op, alloc);
--- a/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp
@ -64,7 +64,8 @@ struct ONNXReshapeOpLowering : public ConversionPattern {
      for (int i = 0; i < memRefShape.size(); ++i) {
        Value index = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
        // Load index from array of indices.
-        Value loadedVal = rewriter.create<LoadOp>(loc, operands[1], index);
+        Value loadedVal =
+            rewriter.create<AffineLoadOp>(loc, operands[1], index);
        // If a dimension is zero, the actual dimension value is taken from the
        // input tensor.
        //
--- a/src/Conversion/ONNXToKrnl/Tensor/Split.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/Split.cpp
@ -92,8 +92,9 @@ struct ONNXSplitOpLowering : public ConversionPattern {
        writeIndices.emplace_back(outputLoops.getInductionVar(r));
      }
      // Insert copy.
-      auto loadData = rewriter.create<LoadOp>(loc, operands[0], readIndices);
-      rewriter.create<StoreOp>(loc, loadData, allocs[i], writeIndices);
+      auto loadData =
+          rewriter.create<AffineLoadOp>(loc, operands[0], readIndices);
+      rewriter.create<AffineStoreOp>(loc, loadData, allocs[i], writeIndices);
    }
    rewriter.replaceOp(op, allocs);
    return success();
--- a/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp
+++ b/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp
@ -80,8 +80,8 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
    for (int i = 0; i < iterationBlock.getArguments().size(); ++i)
      outLoopIVs.emplace_back(iterationBlock.getArguments()[perm[i]]);

-    auto inVal = rewriter.create<LoadOp>(loc, data, inLoopIVs);
-    rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
+    auto inVal = rewriter.create<AffineLoadOp>(loc, data, inLoopIVs);
+    rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);

    rewriter.replaceOp(op, alloc);

--- a/test/mlir/onnx/onnx_enable_memory_pool.mlir
+++ b/test/mlir/onnx/onnx_enable_memory_pool.mlir
@ -14,10 +14,10 @@ func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
  // CHECK: krnl.define_loops
  // CHECK: krnl.optimize_loops
  // CHECK: krnl.iterate
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
  // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
+  // CHECK: affine.store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
  // CHECK: krnl.define_loops
  // CHECK: krnl.optimize_loops
  // CHECK: krnl.iterate
@ -43,26 +43,26 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3
  // CHECK: krnl.define_loops
  // CHECK: krnl.optimize_loops
  // CHECK: krnl.iterate
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: krnl.define_loops
  // CHECK: krnl.optimize_loops
  // CHECK: krnl.iterate
-  // CHECK: [[LOAD3:%.+]] = load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
-  // CHECK: [[LOAD4:%.+]] = load %arg1[%arg4, %arg3] : memref<10x20xf32>
-  // CHECK: [[LOAD5:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
+  // CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
+  // CHECK: [[LOAD4:%.+]] = affine.load %arg1[%arg4, %arg3] : memref<10x20xf32>
+  // CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
  // CHECK: [[MULF1:%.+]] = mulf [[LOAD3]], [[LOAD4]] : f32
  // CHECK: [[ADDF2:%.+]] = addf [[LOAD5]], [[MULF1]] : f32
-  // CHECK: store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
+  // CHECK: affine.store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
  // CHECK: krnl.define_loops
  // CHECK: krnl.optimize_loops
  // CHECK: krnl.iterate
-  // CHECK: [[LOAD6:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
-  // CHECK: [[LOAD7:%.+]] = load %arg1[%arg2, %arg3] : memref<10x20xf32>
+  // CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
+  // CHECK: [[LOAD7:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x20xf32>
  // CHECK: [[ADDF3:%.+]] = addf [[LOAD6]], [[LOAD7]] : f32
-  // CHECK: store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
+  // CHECK: affine.store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
  // CHECK: dealloc [[MEMPOOL1]] : memref<400xi8>
  // CHECK: dealloc [[MEMPOOL0]] : memref<800xi8>
  // CHECK: return [[RES]] : memref<10x20xf32>
--- a/test/mlir/onnx/onnx_lowering.mlir
+++ b/test/mlir/onnx/onnx_lowering.mlir
--- a/test/mlir/onnx/onnx_lowering_with_dealloc.mlir
+++ b/test/mlir/onnx/onnx_lowering_with_dealloc.mlir
@ -16,10 +16,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Add
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -27,10 +27,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -55,10 +55,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Mul
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -66,10 +66,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -94,10 +94,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Div
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -105,10 +105,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -133,10 +133,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Sub
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -144,10 +144,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -172,10 +172,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Second And
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -183,10 +183,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -211,10 +211,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Second Or
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -222,10 +222,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -250,10 +250,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Second Xor
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -261,10 +261,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
-  // CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
+  // CHECK: affine.store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -585,10 +585,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Sum
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -596,10 +596,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -624,11 +624,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
  
  /// Second Max
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -636,11 +636,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
  
  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -665,11 +665,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
  
  /// Second Min
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -677,11 +677,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
  
  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<10x10xf32>