From b27e57cc4f4fdc3ba955639a44026265592e5e09 Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Mon, 20 Jul 2020 19:24:17 -0400 Subject: [PATCH] Emit allocs at the top of functions (#222) * Reorganize main function. * Follow review comments. * Emit constants are globals in Krnl and LLVM dialects. * Add support for moving dynamic alloca instructions to top of functions. * Fix memory pooling tests. * Various fixes. * Fix lit tests. * More test fixes. * Reformat. * Reformat some more. * Fix issue with TestConv and split-input-file. * Use smart pointers. * Remove redundant pointer. * Reformat. * Add initMap description. * Clean up tests. --- .../ONNXToKrnl/ConvertONNXToKrnl.cpp | 49 +- .../ONNXToKrnl/Math/Elementwise.cpp | 12 +- src/Conversion/ONNXToKrnl/Math/Gemm.cpp | 3 +- src/Conversion/ONNXToKrnl/Math/MatMul.cpp | 13 +- src/Conversion/ONNXToKrnl/Math/Reduction.cpp | 4 +- src/Conversion/ONNXToKrnl/Math/Softmax.cpp | 11 +- src/Conversion/ONNXToKrnl/NN/Conv.cpp | 5 +- .../ONNXToKrnl/NN/Normalization.cpp | 5 +- src/Conversion/ONNXToKrnl/NN/Pooling.cpp | 3 +- .../ONNXToKrnl/ONNXToKrnlCommon.cpp | 198 +++++- .../ONNXToKrnl/ONNXToKrnlCommon.hpp | 57 +- src/Conversion/ONNXToKrnl/RNN/LSTM.cpp | 19 +- src/Conversion/ONNXToKrnl/Tensor/Concat.cpp | 6 +- src/Conversion/ONNXToKrnl/Tensor/Identity.cpp | 1 + src/Conversion/ONNXToKrnl/Tensor/Pad.cpp | 4 +- .../ONNXToKrnl/Tensor/PadConstantValuePad.cpp | 4 +- src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp | 3 +- src/Conversion/ONNXToKrnl/Tensor/Split.cpp | 3 +- src/Conversion/ONNXToKrnl/Tensor/Squeeze.cpp | 3 +- .../ONNXToKrnl/Tensor/Transpose.cpp | 6 +- .../ONNXToKrnl/Tensor/Unsqueeze.cpp | 3 +- src/MainUtils.cpp | 1 + src/Transform/BundleMemoryPools.cpp | 2 - src/Transform/EnableMemoryPool.cpp | 4 +- test/mlir/krnl/constant.mlir | 4 +- test/mlir/krnl/memory_pool.mlir | 2 +- test/mlir/krnl/reshape.mlir | 4 +- test/mlir/onnx/onnx_bundle_memory_pool.mlir | 4 +- test/mlir/onnx/onnx_enable_memory_pool.mlir | 30 +- test/mlir/onnx/onnx_lowering.mlir | 601 +----------------- test/mlir/onnx/onnx_lowering_lstm.mlir | 263 ++++++++ test/mlir/onnx/onnx_lowering_pooling.mlir | 121 ++++ test/mlir/onnx/onnx_lowering_reductions.mlir | 93 +++ test/mlir/onnx/onnx_lowering_split.mlir | 85 +++ test/mlir/onnx/onnx_lowering_squeeze.mlir | 29 + .../mlir/onnx/onnx_lowering_with_dealloc.mlir | 100 +-- test/mlir/onnx/onnx_structure.mlir | 2 +- 37 files changed, 1032 insertions(+), 725 deletions(-) create mode 100644 test/mlir/onnx/onnx_lowering_lstm.mlir create mode 100644 test/mlir/onnx/onnx_lowering_pooling.mlir create mode 100644 test/mlir/onnx/onnx_lowering_reductions.mlir create mode 100644 test/mlir/onnx/onnx_lowering_split.mlir create mode 100644 test/mlir/onnx/onnx_lowering_squeeze.mlir diff --git a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp index 2571458..38f860c 100644 --- a/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp +++ b/src/Conversion/ONNXToKrnl/ConvertONNXToKrnl.cpp @@ -1,5 +1,4 @@ -//====------ ConvertONNXToKrnl.cpp - ONNX dialects to Krnl lowering -//--------===// +//====------ ConvertONNXToKrnl.cpp - ONNX dialects to Krnl lowering -------===// // // Copyright 2019 The IBM Research Authors. // @@ -34,6 +33,38 @@ public: } }; +//===----------------------------------------------------------------------===// +// FuncOp lowering to Function with init and main blocks. 
+//===----------------------------------------------------------------------===// + +struct FuncOpSignatureConversion : public OpConversionPattern { + FuncOpSignatureConversion(MLIRContext *ctx, TypeConverter &converter) + : OpConversionPattern(converter, ctx) {} + + /// Hook for derived classes to implement combined matching and rewriting. + LogicalResult matchAndRewrite(FuncOp funcOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + FunctionType type = funcOp.getType(); + + // Convert the original function types. + TypeConverter::SignatureConversion result(type.getNumInputs()); + SmallVector newResults; + if (failed(typeConverter->convertSignatureArgs(type.getInputs(), result)) || + failed(typeConverter->convertTypes(type.getResults(), newResults)) || + failed(rewriter.convertRegionTypes( + &funcOp.getBody(), *typeConverter, &result))) + return failure(); + + // Update the function signature in-place. + rewriter.updateRootInPlace(funcOp, [&] { + funcOp.setType(FunctionType::get( + result.getConvertedTypes(), newResults, funcOp.getContext())); + }); + addInitBlock(rewriter, funcOp.getLoc(), funcOp); + return success(); + } +}; + //===----------------------------------------------------------------------===// // Frontend to Krnl Dialect lowering pass //===----------------------------------------------------------------------===// @@ -49,6 +80,10 @@ struct FrontendToKrnlLoweringPass void FrontendToKrnlLoweringPass::runOnOperation() { ModuleOp module = getOperation(); + // Create an entry for this module + initMap.insert(std::pair>( + module, std::make_unique())); + // The first thing to define is the conversion target. This will define the // final target for this lowering. ConversionTarget target(getContext()); @@ -77,12 +112,6 @@ void FrontendToKrnlLoweringPass::runOnOperation() { return tensor_to_memref_converter.isSignatureLegal(op.getType()); }); - // Type conversion for function signatures. - // Call MLIR FuncOp signature conversion when result type is - // a ranked tensor. - populateFuncOpTypeConversionPattern( - patterns, &getContext(), tensor_to_memref_converter); - // Frontend operation lowering. // Math populateLoweringONNXElementwiseOpPattern(patterns, &getContext()); @@ -109,12 +138,16 @@ void FrontendToKrnlLoweringPass::runOnOperation() { populateLoweringONNXLSTMOpPattern(patterns, &getContext()); // Entry point patterns.insert(&getContext()); + patterns.insert( + &getContext(), tensor_to_memref_converter); // With the target and rewrite patterns defined, we can now attempt the // conversion. The conversion will signal failure if any of our `illegal` // operations were not converted successfully. 
if (failed(applyPartialConversion(module, target, patterns))) signalPassFailure(); + + initMap.erase(module); } std::unique_ptr mlir::createLowerToKrnlPass() { diff --git a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp index 3bc2222..33391c0 100644 --- a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp @@ -518,10 +518,11 @@ struct ONNXElementwiseUnaryOpLowering : public ConversionPattern { bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); - else alloc = - insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, {X}); + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); + else + alloc = insertAllocAndDealloc( + memRefType, loc, rewriter, insertDealloc, op, {X}); SmallVector loopIVs; if (!hasAllScalarValues(operands)) { @@ -574,10 +575,11 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern { // comes from. // TODO: can the dimension of the result differ after optimizations? if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, operands); + memRefType, loc, rewriter, insertDealloc, op, operands); SmallVector loopIVs; std::map> broadcastedDimInfo; diff --git a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp index cce0529..1d3fbcf 100644 --- a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp @@ -46,7 +46,8 @@ struct ONNXGemmOpLowering : public ConversionPattern { Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else { auto memRefShape = memRefType.getShape(); SmallVector allocOperands; diff --git a/src/Conversion/ONNXToKrnl/Math/MatMul.cpp b/src/Conversion/ONNXToKrnl/Math/MatMul.cpp index 69f0006..702fd4e 100644 --- a/src/Conversion/ONNXToKrnl/Math/MatMul.cpp +++ b/src/Conversion/ONNXToKrnl/Math/MatMul.cpp @@ -43,8 +43,16 @@ struct ONNXMatMulOpLowering : public ConversionPattern { Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else { + PatternRewriter::InsertionGuard insertGuard(rewriter); + FuncOp function = getContainingFunction(op); + bool functionLevelAlloc = (op->getParentOp() == function); + bool canMove = checkAllocMovable(function, functionLevelAlloc, {A, B}); + if (canMove) + rewriter.setInsertionPoint(getInitInsertionPoint(function)); + SmallVector allocOperands; if (AShape.size() >= 2 && BShape.size() >= 2) { // Both arguments are N-D, N >= 2 @@ -108,6 +116,9 @@ struct ONNXMatMulOpLowering : public ConversionPattern { } alloc = rewriter.create(loc, memRefType, allocOperands); + + if (canMove) + markOperandInInitBlock(function, alloc); } if (AShape.size() >= 2 || BShape.size() >= 2) { diff --git a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp index 2a66c39..a029c34 100644 --- 
a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp @@ -159,8 +159,8 @@ struct ONNXReductionOpLowering : public ConversionPattern { Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefOutType)) { - alloc = - insertAllocAndDealloc(memRefOutType, loc, rewriter, insertDealloc); + alloc = insertAllocAndDealloc( + memRefOutType, loc, rewriter, insertDealloc, op); } else { SmallVector allocOperands; for (decltype(outRank) i = 0; i < outRank; ++i) { diff --git a/src/Conversion/ONNXToKrnl/Math/Softmax.cpp b/src/Conversion/ONNXToKrnl/Math/Softmax.cpp index 44826e2..4e8986c 100644 --- a/src/Conversion/ONNXToKrnl/Math/Softmax.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Softmax.cpp @@ -36,18 +36,21 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern { Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, input); + memRefType, loc, rewriter, insertDealloc, op, input); // Shape of the result auto memRefShape = memRefType.getShape(); // Insert allocations and deallocations for sum and max. MemRefType scalarMemRefType = MemRefType::get({}, elementType, {}, 0); - Value sumOp = insertAllocAndDealloc(scalarMemRefType, loc, rewriter, true); - Value maxOp = insertAllocAndDealloc(scalarMemRefType, loc, rewriter, true); + Value sumOp = + insertAllocAndDealloc(scalarMemRefType, loc, rewriter, true, op); + Value maxOp = + insertAllocAndDealloc(scalarMemRefType, loc, rewriter, true, op); Value zero = emitConstantOp(rewriter, loc, elementType, 0); Value negInfinity = rewriter.create(loc, FloatAttr::get(elementType, -std::numeric_limits::infinity())); diff --git a/src/Conversion/ONNXToKrnl/NN/Conv.cpp b/src/Conversion/ONNXToKrnl/NN/Conv.cpp index 607f43b..11fa940 100644 --- a/src/Conversion/ONNXToKrnl/NN/Conv.cpp +++ b/src/Conversion/ONNXToKrnl/NN/Conv.cpp @@ -36,10 +36,11 @@ struct ONNXConvOpLowering : public ConversionPattern { bool hasBias = !biasOperand.getType().isa(); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, {inputOperand}); + memRefType, loc, rewriter, insertDealloc, op, {inputOperand}); // R = Conv(D, K) // diff --git a/src/Conversion/ONNXToKrnl/NN/Normalization.cpp b/src/Conversion/ONNXToKrnl/NN/Normalization.cpp index e160bea..a5959e7 100644 --- a/src/Conversion/ONNXToKrnl/NN/Normalization.cpp +++ b/src/Conversion/ONNXToKrnl/NN/Normalization.cpp @@ -42,10 +42,11 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern { bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, {operand}); + memRefType, loc, rewriter, insertDealloc, op, {operand}); // Operand's dimensions can be in the form of NxCxD1xD2x...xDn or N. // In case of N, C is assumed to be 1. 
diff --git a/src/Conversion/ONNXToKrnl/NN/Pooling.cpp b/src/Conversion/ONNXToKrnl/NN/Pooling.cpp index d40de6f..ecc2286 100644 --- a/src/Conversion/ONNXToKrnl/NN/Pooling.cpp +++ b/src/Conversion/ONNXToKrnl/NN/Pooling.cpp @@ -235,7 +235,8 @@ struct ONNXPoolOpLowering : public ConversionPattern { bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else { alloc = insertAllocAndDeallocForPooling(rewriter, loc, insertDealloc, memRefType, inputOperand, kernelShape, pads, strides, dilations, diff --git a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp index c57d0a4..607e6dd 100644 --- a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp +++ b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.cpp @@ -11,6 +11,8 @@ #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp" +std::map> initMap; + /// Check is all dimensions are known at compile time. bool hasAllConstantDimensions(MemRefType type) { auto memRefShape = type.getShape(); @@ -43,11 +45,151 @@ MemRefType convertToMemRefType(Type type) { return memRefType; } +/// Retrieve function which contains the current operation. +FuncOp getContainingFunction(Operation *op) { + Operation *parentFuncOp = op->getParentOp(); + + // While parent is not a FuncOp and its cast to a FuncOp is null. + while (!llvm::dyn_cast_or_null(parentFuncOp)) + parentFuncOp = parentFuncOp->getParentOp(); + + return cast(parentFuncOp); +} + +void addInitBlock(PatternRewriter &rewriter, Location loc, FuncOp function) { + // If this is the first time we encounter an operation in this + // function, we create an entry inside the initMap and split the + // function body into an init block and a main block. + // + // function func_name() { + // ... init block ... + // br ^bb1 + // ^bb1: // pred: ^bb0 + // ... main block ... + // return + // } + // + // Note: the block ^bb0 being the first block has its label omitted. + // + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + if (initStates->count(function) == 0) { + initStates->insert( + std::pair>( + function, std::make_unique())); + std::unique_ptr &initState = + initStates->at(function); + + // All input arguments are considered as part of the initialization block + // so add them to the operandsInInitBlock set. + Block *functionBlock = &function.front(); + for (auto arg : functionBlock->getArguments()) + initState->operandsInInitBlock.insert(arg); + + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(functionBlock); + + initState->initBlock = rewriter.getInsertionBlock(); + auto currentPoint = rewriter.getInsertionPoint(); + initState->mainBlock = + rewriter.splitBlock(initState->initBlock, currentPoint); + + rewriter.setInsertionPointToEnd(initState->initBlock); + + // Insert a branch operation from initBlock to mainBlock. This + // ensures the final code contains legal blocks. + initState->branchInit = + rewriter.create(loc, initState->mainBlock); + + // Set insertion point to start of mainBlock. 
+ rewriter.setInsertionPointToStart(initState->mainBlock); + } +} + +bool containingFunctionHasInitBlock(Operation *op) { + FuncOp function = getContainingFunction(op); + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + return initStates->count(function) > 0; +} + +Block *getInitBlock(FuncOp function) { + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + assert(initStates->count(function) > 0 && + "Initialization state not defined for this function."); + return initStates->at(function)->initBlock; +} + +Block *getMainBlock(FuncOp function) { + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + assert(initStates->count(function) > 0 && + "Initialization state not defined for this function."); + return initStates->at(function)->mainBlock; +} + +BranchOp getInitInsertionPoint(FuncOp function) { + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + assert(initStates->count(function) > 0 && + "Initialization state not defined for this function."); + return initStates->at(function)->branchInit; +} + +/// Check if all operands used for allocating the size of the result are +/// in the initialization block (i.e. initBlock). +bool checkAllocMovable( + FuncOp function, bool functionLevelAlloc, ArrayRef operands) { + // If no initialization block exists then alloc cannot be moved. + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + if (initStates->count(function) == 0) + return false; + + // If the alloc is not function level alloc then it cannot be moved. + if (!functionLevelAlloc) + return false; + + bool allInitOrArg = true; + for (int i = 0; i < operands.size(); i++) { + if (initStates->at(function)->operandsInInitBlock.count(operands[i]) == 0) + allInitOrArg = false; + } + + return allInitOrArg; +} + +/// Add operand to list of operands in the init block. +void markOperandInInitBlock(FuncOp function, Value operand) { + // Check if function is valid. At this point it has to be. + assert(function && "Attempt to add operand when function is null."); + ModuleOp module = cast(function.getParentOp()); + std::unique_ptr &initStates = initMap.at(module); + // A valid function must have an initialization state. + assert(initStates->count(function) > 0 && + "Initialization state not defined for this function."); + initStates->at(function)->operandsInInitBlock.insert(operand); +} + /// Insert an allocation and deallocation for the given MemRefType. -Value insertAllocAndDealloc(MemRefType type, Location loc, - PatternRewriter &rewriter, bool insertDealloc, ArrayRef operands, - int64_t alignment) { +Value insertAllocAndDeallocWithFunction(MemRefType type, Location loc, + PatternRewriter &rewriter, bool insertDealloc, FuncOp function, + bool functionLevelAlloc, ArrayRef operands, int64_t alignment) { // Put together alloc operands for any dynamic dimensions of the memref. + // Save insertion point in case we need to change it to the initBlock. + PatternRewriter::InsertionGuard insertGuard(rewriter); + + // Check if all operands of the alloc are in the init region or are input + // arguments. If some of them are not or there is no init block, this + // variable will be false. 
+  bool canMove = checkAllocMovable(function, functionLevelAlloc, operands);
+
+  // If a legal move to the init block is possible, set insertion point
+  // at the end of the initialization block just before the branch instruction.
+  if (canMove)
+    rewriter.setInsertionPoint(getInitInsertionPoint(function));
+
   AllocOp alloc;
   if (!operands.empty()) {
     auto memRefShape = type.getShape();
@@ -97,6 +239,11 @@ Value insertAllocAndDealloc(MemRefType type, Location loc,
     } else {
       alloc = rewriter.create<AllocOp>(loc, type, allocOperands);
     }
+
+    // If the alloc was emitted inside the initialization block then add it
+    // to the set of values emitted in the initialization block.
+    if (canMove)
+      markOperandInInitBlock(function, alloc.getResult());
   } else {
     // Set alignment attribute. Default value is `-1`, which does not set
     // alignment.
@@ -113,17 +260,52 @@
   // Make sure to allocate at the beginning of the block if
   // all dimensions are known.
   auto *parentBlock = alloc.getOperation()->getBlock();
-  if (hasAllConstantDimensions(type))
-    alloc.getOperation()->moveBefore(&parentBlock->front());
+  if (hasAllConstantDimensions(type)) {
+    // Check if this move is a move to the init block or to the top of the
+    // function without an init block. For the case in which all dimensions
+    // are constant, the `canMove` variable will be false if there is no
+    // init block.
+    if (canMove) {
+      // The alloc was emitted in the init block already so just record
+      // that this value is now available in the init block.
+      alloc.getOperation()->moveBefore(&getInitBlock(function)->front());
+      markOperandInInitBlock(function, alloc.getResult());
+    } else {
+      // No init block exists in this case so just move it as before.
+      alloc.getOperation()->moveBefore(&parentBlock->front());
+    }
+  }
 
   if (insertDealloc) {
     auto dealloc = rewriter.create<DeallocOp>(loc, alloc);
-    dealloc.getOperation()->moveBefore(&parentBlock->back());
+    // Move dealloc to the end of the main block if such a block exists.
+    if (canMove) {
+      Block *mainBlock = getMainBlock(function);
+      dealloc.getOperation()->moveBefore(&mainBlock->back());
+    } else {
+      // If no main block exists, move to parent block.
+      dealloc.getOperation()->moveBefore(&parentBlock->back());
+    }
   }
 
   return alloc;
 }
 
+/// Insert an allocation and deallocation for the given MemRefType.
+Value insertAllocAndDealloc(MemRefType type, Location loc,
+    PatternRewriter &rewriter, bool insertDealloc, Operation *op,
+    ArrayRef<Value> operands, int64_t alignment) {
+  FuncOp function = getContainingFunction(op);
+
+  bool functionLevelAlloc = (op->getParentOp() == function);
+  if (!functionLevelAlloc) {
+    printf("This is not a function level alloc!\n");
+  }
+
+  return insertAllocAndDeallocWithFunction(type, loc, rewriter, insertDealloc,
+      function, functionLevelAlloc, operands, alignment);
+}
+
 // Determine if current function returns the result value of the
 // current op being lowered. If it does then dealloc should not be
 // inserted.
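For reference, a minimal sketch of the call pattern the per-op lowerings in this patch are updated to; `ONNXFooOpLowering`, the `"onnx.Foo"` op name, and the choice of `operands[0]` as the shape-carrying operand are illustrative placeholders, while the helpers (`insertAllocAndDealloc`, `checkInsertDealloc`, `convertToMemRefType`) are the ones declared in ONNXToKrnlCommon.hpp in this patch. Passing the lowered `op` lets the helper find the containing FuncOp and decide whether the alloc (and its dealloc) can be placed in the function's init and main blocks.

// Sketch only, not part of the patch.
struct ONNXFooOpLowering : public ConversionPattern {
  ONNXFooOpLowering(MLIRContext *ctx) : ConversionPattern("onnx.Foo", 1, ctx) {}

  LogicalResult matchAndRewrite(Operation *op, ArrayRef<Value> operands,
      ConversionPatternRewriter &rewriter) const final {
    Location loc = op->getLoc();
    MemRefType memRefType = convertToMemRefType(*op->result_type_begin());
    bool insertDealloc = checkInsertDealloc(op);

    Value alloc;
    if (hasAllConstantDimensions(memRefType))
      // Static shape: no dynamic alloc operands are needed.
      alloc =
          insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op);
    else
      // Dynamic shape: pass the values supplying the dynamic dimensions so
      // the helper can check they are visible from the init block.
      alloc = insertAllocAndDealloc(
          memRefType, loc, rewriter, insertDealloc, op, {operands[0]});

    // ... emit the Krnl loops that compute into `alloc` ...
    rewriter.replaceOp(op, alloc);
    return success();
  }
};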
@@ -463,10 +645,10 @@ int64_t ArrayAttrIntVal(ArrayAttr a, int i) {
 }
 
 bool checkOpResultIsUsedByGetRef(AllocOp *allocOp) {
-  auto parentBlock = allocOp->getOperation()->getBlock();
+  FuncOp function = getContainingFunction(allocOp->getOperation());
 
   bool opIsUsedInGetRef = false;
-  parentBlock->walk([&opIsUsedInGetRef, allocOp](KrnlGetRefOp op) {
+  function.walk([&opIsUsedInGetRef, allocOp](KrnlGetRefOp op) {
     auto result = allocOp->getResult();
     for (const auto &operand : op.getOperands())
       if (operand == result)
diff --git a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
index 6b6660c..caca3d8 100644
--- a/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
+++ b/src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp
@@ -19,7 +19,9 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
 #include "src/Dialect/Krnl/KrnlHelper.hpp"
 #include "src/Dialect/Krnl/KrnlOps.hpp"
@@ -29,6 +31,37 @@
 using namespace mlir;
 
+//===----------------------------------------------------------------------===//
+// Insertion point for initialization instructions and the blocks used for
+// inserting the initialization and main code. These blocks will disappear
+// when the first canonicalization is performed because the init block
+// unconditionally branches into the second block. These blocks exist only for
+// the purpose of this optimization.
+// The support happens on a per-function basis.
+//===----------------------------------------------------------------------===//
+
+typedef struct ONNXOperandsInitState {
+  Block *initBlock;
+  Block *mainBlock;
+  BranchOp branchInit;
+  llvm::SetVector<Value> operandsInInitBlock;
+} ONNXOperandsInitState;
+
+typedef std::map<FuncOp, std::unique_ptr<ONNXOperandsInitState>>
+    FunctionToInitStates;
+
+// This map is used by the FrontendToKrnlLoweringPass pass to keep track of the
+// allocations emitted in the initialization block for each function of a given
+// module. A translation unit can consist of several modules, each with several
+// functions, hence the structure shown below.
+// This data structure enables the emission of dynamic `alloc` instructions
+// in the initialization block of a function if all the operands that the
+// computation of its parameters depends on are also present in that function's
+// initialization block.
+// This data structure is live only during the execution of the frontend
+// lowering to Krnl dialect pass (FrontendToKrnlLoweringPass).
+extern std::map<ModuleOp, std::unique_ptr<FunctionToInitStates>> initMap;
+
 //===----------------------------------------------------------------------===//
 // Common functions used when lowering the ONNX frontend dialect to KRNL.
 //===----------------------------------------------------------------------===//
@@ -44,9 +77,14 @@ MemRefType convertToMemRefType(Type type);
 
 /// Insert an allocation and deallocation for the given MemRefType.
 Value insertAllocAndDealloc(MemRefType type, Location loc,
-    PatternRewriter &rewriter, bool insertDealloc,
+    PatternRewriter &rewriter, bool insertDealloc, Operation *op,
     ArrayRef<Value> operands = {}, int64_t alignment = -1);
 
+Value insertAllocAndDeallocWithFunction(MemRefType type, Location loc,
+    PatternRewriter &rewriter, bool insertDealloc, FuncOp function,
+    bool functionLevelAlloc, ArrayRef<Value> operands = {},
+    int64_t alignment = -1);
+
 // Determine if current function returns the result value of the
 // current op being lowered. If it does then dealloc should not be
 // inserted.
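A condensed sketch of the lifetime of this state, using only the declarations above; the wrapper function itself is illustrative and not part of the patch. The lowering pass registers one `FunctionToInitStates` per module, `addInitBlock` lazily creates one `ONNXOperandsInitState` per function when its first op is lowered, and the module entry is erased once the conversion finishes.

// Sketch only: how the pass and the helpers consult initMap.
void exampleInitMapLifetime(
    ModuleOp module, FuncOp function, PatternRewriter &rewriter) {
  // Pass entry (runOnOperation): register the module.
  initMap.insert(std::pair<ModuleOp, std::unique_ptr<FunctionToInitStates>>(
      module, std::make_unique<FunctionToInitStates>()));

  // First lowered op in `function`: split its entry block into an init block
  // and a main block joined by an unconditional branch.
  addInitBlock(rewriter, function.getLoc(), function);

  // Any later lowering can query the per-function state.
  Block *initBlock = getInitBlock(function);
  Block *mainBlock = getMainBlock(function);
  (void)initBlock;
  (void)mainBlock;

  // Pass exit: the state is only live for the duration of the pass.
  initMap.erase(module);
}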
@@ -246,3 +284,20 @@ void populateLoweringONNXSplitOpPattern( bool checkOpResultIsUsedByGetRef(AllocOp *allocOp); int64_t getMemRefSizeInBytes(Value val); + +FuncOp getContainingFunction(Operation *op); + +void addInitBlock(PatternRewriter &rewriter, Location loc, FuncOp op); + +bool containingFunctionHasInitBlock(Operation *op); + +Block *getInitBlock(FuncOp function); + +Block *getMainBlock(FuncOp function); + +BranchOp getInitInsertionPoint(FuncOp function); + +bool checkAllocMovable( + FuncOp function, bool functionLevelAlloc, ArrayRef operands); + +void markOperandInInitBlock(FuncOp function, Value operand); diff --git a/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp b/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp index 808de78..1fb7755 100644 --- a/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp +++ b/src/Conversion/ONNXToKrnl/RNN/LSTM.cpp @@ -161,13 +161,14 @@ LstmState allocAndInitializeStates( ConversionPatternRewriter &rewriter, Location loc, ONNXLSTMOp *op, typename ONNXLSTMOp::Adaptor operandAdaptor) { LstmState state; + FuncOp function = cast(op->getParentOp()); // Insert allocation and deallocation for the results of this operation. if (!isNoneType(op->Y())) { auto yMemRefType = convertToMemRefType(op->Y().getType()); if (hasAllConstantDimensions(yMemRefType)) - state.allH = insertAllocAndDealloc(yMemRefType, loc, rewriter, - checkInsertDealloc(op->getOperation(), 0)); + state.allH = insertAllocAndDeallocWithFunction(yMemRefType, loc, rewriter, + checkInsertDealloc(op->getOperation(), 0), function, true); else { llvm_unreachable("Unsupported dynamic dimensions."); } @@ -179,8 +180,8 @@ LstmState allocAndInitializeStates( if (!isNoneType(op->Y_h())) { auto yhMemRefType = convertToMemRefType(op->Y_h().getType()); if (hasAllConstantDimensions(yhMemRefType)) - state.ht = insertAllocAndDealloc(yhMemRefType, loc, rewriter, - checkInsertDealloc(op->getOperation(), 1)); + state.ht = insertAllocAndDeallocWithFunction(yhMemRefType, loc, rewriter, + checkInsertDealloc(op->getOperation(), 1), function, true); else llvm_unreachable("Unsupported dynamic dimensions."); } else { @@ -188,15 +189,16 @@ LstmState allocAndInitializeStates( {dimAt(operandAdaptor.W(), 0), dimAt(operandAdaptor.X(), 1), dimAt(operandAdaptor.R(), 2)}, operandAdaptor.X().getType().cast().getElementType()); - state.ht = insertAllocAndDealloc(yhMemRefType, loc, rewriter, true); + state.ht = insertAllocAndDeallocWithFunction( + yhMemRefType, loc, rewriter, true, function, true); } // Y_c :: [num_directions, batch_size, hidden_size] if (!isNoneType(op->Y_c())) { auto ycMemRefType = convertToMemRefType(op->Y_c().getType()); if (hasAllConstantDimensions(ycMemRefType)) - state.ct = insertAllocAndDealloc(ycMemRefType, loc, rewriter, - checkInsertDealloc(op->getOperation(), 2)); + state.ct = insertAllocAndDeallocWithFunction(ycMemRefType, loc, rewriter, + checkInsertDealloc(op->getOperation(), 2), function, true); else llvm_unreachable("Unsupported dynamic dimensions."); } else { @@ -204,7 +206,8 @@ LstmState allocAndInitializeStates( {dimAt(operandAdaptor.W(), 0), dimAt(operandAdaptor.X(), 1), dimAt(operandAdaptor.R(), 2)}, operandAdaptor.X().getType().cast().getElementType()); - state.ct = insertAllocAndDealloc(ycMemRefType, loc, rewriter, true); + state.ct = insertAllocAndDeallocWithFunction( + ycMemRefType, loc, rewriter, true, function, true); } // Initialize ht and ct. 
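For lowerings that build their own `AllocOp` for dynamically shaped results (as `ONNXMatMulOpLowering` does earlier in this patch), the hoisting is driven manually with the helpers declared above. A minimal sketch, assuming a single dynamic dimension taken from `input`; the function name and the dimension logic are illustrative, the helper calls are the ones added by this patch.

// Sketch only: manual hoisting of a dynamic alloc into the init block.
Value allocDynamicResult(Operation *op, MemRefType memRefType, Value input,
    ConversionPatternRewriter &rewriter, Location loc) {
  // The guard restores the original insertion point afterwards, so only the
  // alloc itself ends up in the init block.
  PatternRewriter::InsertionGuard insertGuard(rewriter);

  FuncOp function = getContainingFunction(op);
  bool functionLevelAlloc = (op->getParentOp() == function);
  bool canMove = checkAllocMovable(function, functionLevelAlloc, {input});
  if (canMove)
    rewriter.setInsertionPoint(getInitInsertionPoint(function));

  // Dynamic dimension taken from the input operand (illustrative).
  SmallVector<Value, 4> allocOperands;
  allocOperands.emplace_back(rewriter.create<DimOp>(loc, input, 0));
  Value alloc = rewriter.create<AllocOp>(loc, memRefType, allocOperands);

  // Record that the new value now lives in the init block so later allocs
  // whose sizes depend on it can be hoisted as well.
  if (canMove)
    markOperandInInitBlock(function, alloc);
  return alloc;
}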
diff --git a/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp b/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp index c335d82..1bef186 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Concat.cpp @@ -20,6 +20,7 @@ struct ONNXConcatOpLowering : public ConversionPattern { ConversionPatternRewriter &rewriter) const final { // Gather info. auto loc = op->getLoc(); + Value alloc; bool insertDealloc = checkInsertDealloc(op); ONNXConcatOp concatOp = llvm::dyn_cast(op); @@ -33,10 +34,11 @@ struct ONNXConcatOpLowering : public ConversionPattern { assert((axis >= 0 && axis < rank) && "Concat axis out of bounds"); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, {resultOperand}); + memRefType, loc, rewriter, insertDealloc, op, {resultOperand}); // Creates loops, one for each input. int writeOffset = 0; diff --git a/src/Conversion/ONNXToKrnl/Tensor/Identity.cpp b/src/Conversion/ONNXToKrnl/Tensor/Identity.cpp index 3f0b305..e726e2a 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Identity.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Identity.cpp @@ -18,6 +18,7 @@ struct ONNXIdentityOpLowering : public ConversionPattern { LogicalResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); ONNXIdentityOpAdaptor operandAdaptor(operands); rewriter.replaceOp(op, operandAdaptor.input()); return success(); diff --git a/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp b/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp index 2f34b87..705a77d 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Pad.cpp @@ -40,11 +40,13 @@ struct ONNXPadOpLowering : public ConversionPattern { return emitError(loc, "Pad: unknown pads"); auto memRefType = convertToMemRefType(tensorType); + Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else return emitError(loc, "unexpected output has non-Constant shape"); diff --git a/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp b/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp index ebc9195..6498f7e 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/PadConstantValuePad.cpp @@ -32,11 +32,13 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern { // Insert an allocation and deallocation for the result of this operation. 
auto memRefType = convertToMemRefType(tensorType); + Value alloc; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else return emitError(loc, "unexpected output has non-Constant shape"); diff --git a/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp b/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp index d7032a2..52c2db6 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Reshape.cpp @@ -46,7 +46,8 @@ struct ONNXReshapeOpLowering : public ConversionPattern { bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) { - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); } else { // If a dimension is zero, the actual dimension value is taken from the // input tensor. diff --git a/src/Conversion/ONNXToKrnl/Tensor/Split.cpp b/src/Conversion/ONNXToKrnl/Tensor/Split.cpp index 68e1ba3..3002668 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Split.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Split.cpp @@ -40,7 +40,8 @@ struct ONNXSplitOpLowering : public ConversionPattern { auto memRefType = convertToMemRefType(splitOp.outputs()[i].getType()); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else { SmallVector allocOperands; auto shape = memRefType.getShape(); diff --git a/src/Conversion/ONNXToKrnl/Tensor/Squeeze.cpp b/src/Conversion/ONNXToKrnl/Tensor/Squeeze.cpp index 87ca2fe..78b773a 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Squeeze.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Squeeze.cpp @@ -39,7 +39,8 @@ struct ONNXSqueezeOpLowering : public ConversionPattern { Value alloc, tensorSize; bool insertDealloc = checkInsertDealloc(op); if (hasAllConstantDimensions(memRefType)) { - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); auto tensorSizeInBytes = elementSizeInBytes; for (int i = 0; i < memRefShape.size(); ++i) { tensorSizeInBytes *= memRefShape[i]; diff --git a/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp b/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp index d912f9e..a4d7e97 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Transpose.cpp @@ -22,15 +22,17 @@ struct ONNXTransposeOpLowering : public ConversionPattern { auto loc = op->getLoc(); // Insert an allocation and deallocation for the result of this operation. 
auto memRefType = convertToMemRefType(*op->result_type_begin()); + Value alloc; bool insertDealloc = checkInsertDealloc(op); Value data = operandAdaptor.data(); if (hasAllConstantDimensions(memRefType)) - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); else alloc = insertAllocAndDealloc( - memRefType, loc, rewriter, insertDealloc, {data}); + memRefType, loc, rewriter, insertDealloc, op, {data}); // Number of loops auto memRefShape = memRefType.getShape(); diff --git a/src/Conversion/ONNXToKrnl/Tensor/Unsqueeze.cpp b/src/Conversion/ONNXToKrnl/Tensor/Unsqueeze.cpp index 254ddcf..c60654d 100644 --- a/src/Conversion/ONNXToKrnl/Tensor/Unsqueeze.cpp +++ b/src/Conversion/ONNXToKrnl/Tensor/Unsqueeze.cpp @@ -44,7 +44,8 @@ struct ONNXUnsqueezeOpLowering : public ConversionPattern { bool insertDealloc = checkInsertDealloc(op); auto memRefShape = memRefType.getShape(); if (hasAllConstantDimensions(memRefType)) { - alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc); + alloc = + insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc, op); for (int i = 0; i < memRefShape.size(); ++i) { Value dimVal = emitConstantOp( rewriter, loc, rewriter.getIntegerType(64), memRefShape[i]); diff --git a/src/MainUtils.cpp b/src/MainUtils.cpp index 0a43d1d..7eb7ebb 100644 --- a/src/MainUtils.cpp +++ b/src/MainUtils.cpp @@ -378,6 +378,7 @@ void addONNXToMLIRPasses(mlir::PassManager &pm) { void addONNXToKrnlPasses(mlir::PassManager &pm) { pm.addPass(mlir::createLowerToKrnlPass()); + pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createPackKrnlGlobalConstantsPass()); // An additional pass of canonicalization is helpful because lowering // from ONNX dialect to Standard dialect exposes additional canonicalization diff --git a/src/Transform/BundleMemoryPools.cpp b/src/Transform/BundleMemoryPools.cpp index f8e1ddc..edfeb0a 100644 --- a/src/Transform/BundleMemoryPools.cpp +++ b/src/Transform/BundleMemoryPools.cpp @@ -87,8 +87,6 @@ public: // Get a KrnlGetRefOp which does not use the current alloc. if (KrnlGetRefOp unbundledGetRef = getUnbundledGetRef(&allocOp)) { - unbundledGetRef.dump(); - // Current memory pool size is the offset for the newly bundled // internal MemRef. Emit the offset as a constant. 
auto offset = rewriter.create( diff --git a/src/Transform/EnableMemoryPool.cpp b/src/Transform/EnableMemoryPool.cpp index c41293f..3ffa8c5 100644 --- a/src/Transform/EnableMemoryPool.cpp +++ b/src/Transform/EnableMemoryPool.cpp @@ -24,10 +24,10 @@ using namespace mlir; namespace { bool checkOpResultIsReturned(AllocOp *allocOp) { - auto parentBlock = allocOp->getOperation()->getBlock(); + FuncOp function = getContainingFunction(allocOp->getOperation()); bool opIsReturned = false; - parentBlock->walk([&opIsReturned, allocOp](ReturnOp op) { + function.walk([&opIsReturned, allocOp](ReturnOp op) { auto result = allocOp->getResult(); for (const auto &operand : op.getOperands()) if (operand == result) diff --git a/test/mlir/krnl/constant.mlir b/test/mlir/krnl/constant.mlir index 8f71c7c..acf2b51 100644 --- a/test/mlir/krnl/constant.mlir +++ b/test/mlir/krnl/constant.mlir @@ -1,6 +1,4 @@ -// RUN: onnx-mlir-opt --shape-inference --lower-frontend --lower-krnl --lower-all-llvm %s -split-input-file | FileCheck %s - -// ----- +// RUN: onnx-mlir-opt --shape-inference --lower-frontend --lower-krnl --lower-all-llvm %s | FileCheck %s func @test_constant(%arg0 : tensor<1xf32>) -> tensor<*xf32> { %0 = "onnx.Constant"() {value = dense<[[0.0, 0.0], [1.0, 1.1], [2.0, 2.1]]> : tensor<3x2xf32>} : () -> tensor<*xf32> diff --git a/test/mlir/krnl/memory_pool.mlir b/test/mlir/krnl/memory_pool.mlir index 49fad5c..d013e29 100644 --- a/test/mlir/krnl/memory_pool.mlir +++ b/test/mlir/krnl/memory_pool.mlir @@ -1,4 +1,4 @@ -// RUN: onnx-mlir-opt --shape-inference --lower-frontend --enable-memory-pool --lower-krnl --lower-all-llvm %s -split-input-file | FileCheck %s +// RUN: onnx-mlir-opt --shape-inference --lower-frontend --canonicalize --enable-memory-pool --lower-krnl --lower-all-llvm %s | FileCheck %s func @test_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> { %0 = "onnx.Add"(%arg0, %arg0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> diff --git a/test/mlir/krnl/reshape.mlir b/test/mlir/krnl/reshape.mlir index dc105d2..9389818 100644 --- a/test/mlir/krnl/reshape.mlir +++ b/test/mlir/krnl/reshape.mlir @@ -1,6 +1,4 @@ -// RUN: onnx-mlir-opt --shape-inference --lower-frontend --lower-krnl --lower-all-llvm %s -split-input-file | FileCheck %s - -// ----- +// RUN: onnx-mlir-opt --shape-inference --lower-frontend --lower-krnl --lower-all-llvm %s | FileCheck %s func @test_reshape(%arg0 : tensor, %arg1 : tensor<4xi64>) -> tensor<*xf32> { %0 = "onnx.Reshape"(%arg0, %arg1) : (tensor, tensor<4xi64>) -> tensor<*xf32> diff --git a/test/mlir/onnx/onnx_bundle_memory_pool.mlir b/test/mlir/onnx/onnx_bundle_memory_pool.mlir index f6288a8..6450453 100644 --- a/test/mlir/onnx/onnx_bundle_memory_pool.mlir +++ b/test/mlir/onnx/onnx_bundle_memory_pool.mlir @@ -1,4 +1,4 @@ -// RUN: onnx-mlir-opt --shape-inference --lower-frontend --enable-memory-pool --bundle-memory-pools --canonicalize %s -split-input-file | FileCheck %s +// RUN: onnx-mlir-opt --shape-inference --lower-frontend --canonicalize --enable-memory-pool --bundle-memory-pools --canonicalize %s | FileCheck %s func @test_bundle_memory_pool(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf32>) -> tensor<10x20xf32> { %0 = "onnx.Add"(%arg0, %arg0) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> @@ -10,8 +10,8 @@ func @test_bundle_memory_pool(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf32> return %5 : tensor<10x20xf32> // CHECK-LABEL: test_bundle_memory_pool - // CHECK: [[CONST0:%.+]] = constant 0 : i64 // CHECK: [[CONST00:%.+]] = constant 
0.000000e+00 : f32 + // CHECK: [[CONST0:%.+]] = constant 0 : i64 // CHECK: [[CONST400:%.+]] = constant 400 : i64 // CHECK: [[CONST1200:%.+]] = constant 1200 : i64 // CHECK: [[CONST2000:%.+]] = constant 2000 : i64 diff --git a/test/mlir/onnx/onnx_enable_memory_pool.mlir b/test/mlir/onnx/onnx_enable_memory_pool.mlir index 62b305c..f18bc14 100644 --- a/test/mlir/onnx/onnx_enable_memory_pool.mlir +++ b/test/mlir/onnx/onnx_enable_memory_pool.mlir @@ -1,4 +1,4 @@ -// RUN: onnx-mlir-opt --shape-inference --lower-frontend --enable-memory-pool %s -split-input-file | FileCheck %s +// RUN: onnx-mlir-opt --shape-inference --lower-frontend --canonicalize --enable-memory-pool %s | FileCheck %s /// One intermediate value to allocate in the memory pool. func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> { @@ -13,10 +13,10 @@ func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> { // CHECK: [[GETREF:%.+]] = "krnl.getref"([[MEMPOOL]], [[CONST0]]) : (memref<400xi8>, i64) -> memref<10x10xf32> // CHECK: krnl.define_loops // CHECK: krnl.iterate - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32> + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[symbol(%arg1), symbol(%arg2)] : memref<10x10xf32> + // CHECK: [[LOAD2:%.+]] = affine.load %arg0[symbol(%arg1), symbol(%arg2)] : memref<10x10xf32> // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32 - // CHECK: affine.store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32> + // CHECK: affine.store [[ADDF1]], [[GETREF]][symbol(%arg1), symbol(%arg2)] : memref<10x10xf32> // CHECK: krnl.define_loops // CHECK: krnl.iterate // CHECK: dealloc [[MEMPOOL]] : memref<400xi8> @@ -31,8 +31,8 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3 return %2 : tensor<10x20xf32> // CHECK-LABEL: test_enable_memory_pool_2 - // CHECK: [[CONST0:%.+]] = constant 0 : i64 // CHECK: [[CONST1:%.+]] = constant 0.000000e+00 : f32 + // CHECK: [[CONST0:%.+]] = constant 0 : i64 // CHECK: [[RES:%.+]] = alloc() : memref<10x20xf32> // CHECK: [[MEMPOOL0:%.+]] = alloc() : memref<800xi8> // CHECK: [[GETREF0:%.+]] = "krnl.getref"([[MEMPOOL0]], [[CONST0]]) : (memref<800xi8>, i64) -> memref<10x20xf32> @@ -40,24 +40,24 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3 // CHECK: [[GETREF1:%.+]] = "krnl.getref"([[MEMPOOL1]], [[CONST0]]) : (memref<400xi8>, i64) -> memref<10x10xf32> // CHECK: krnl.define_loops // CHECK: krnl.iterate - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32> + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[symbol(%arg2), symbol(%arg3)] : memref<10x10xf32> + // CHECK: [[LOAD2:%.+]] = affine.load %arg0[symbol(%arg2), symbol(%arg3)] : memref<10x10xf32> // CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32 - // CHECK: affine.store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32> + // CHECK: affine.store [[ADDF1]], [[GETREF1]][symbol(%arg2), symbol(%arg3)] : memref<10x10xf32> // CHECK: krnl.define_loops // CHECK: krnl.iterate - // CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32> - // CHECK: [[LOAD4:%.+]] = affine.load %arg1[%arg4, %arg3] : memref<10x20xf32> - // CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32> + // CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][symbol(%arg2), symbol(%arg4)] : 
memref<10x10xf32> + // CHECK: [[LOAD4:%.+]] = affine.load %arg1[symbol(%arg4), symbol(%arg3)] : memref<10x20xf32> + // CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][symbol(%arg2), symbol(%arg3)] : memref<10x20xf32> // CHECK: [[MULF1:%.+]] = mulf [[LOAD3]], [[LOAD4]] : f32 // CHECK: [[ADDF2:%.+]] = addf [[LOAD5]], [[MULF1]] : f32 - // CHECK: affine.store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32> + // CHECK: affine.store [[ADDF2]], [[GETREF0]][symbol(%arg2), symbol(%arg3)] : memref<10x20xf32> // CHECK: krnl.define_loops // CHECK: krnl.iterate - // CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32> - // CHECK: [[LOAD7:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x20xf32> + // CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][symbol(%arg2), symbol(%arg3)] : memref<10x20xf32> + // CHECK: [[LOAD7:%.+]] = affine.load %arg1[symbol(%arg2), symbol(%arg3)] : memref<10x20xf32> // CHECK: [[ADDF3:%.+]] = addf [[LOAD6]], [[LOAD7]] : f32 - // CHECK: affine.store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32> + // CHECK: affine.store [[ADDF3]], [[RES]][symbol(%arg2), symbol(%arg3)] : memref<10x20xf32> // CHECK: dealloc [[MEMPOOL1]] : memref<400xi8> // CHECK: dealloc [[MEMPOOL0]] : memref<800xi8> // CHECK: return [[RES]] : memref<10x20xf32> diff --git a/test/mlir/onnx/onnx_lowering.mlir b/test/mlir/onnx/onnx_lowering.mlir index 5aa3ef7..f6c6dfa 100644 --- a/test/mlir/onnx/onnx_lowering.mlir +++ b/test/mlir/onnx/onnx_lowering.mlir @@ -692,100 +692,6 @@ func @test_add_with_broadcasting(%arg0 : tensor, %arg1 : tensor // CHECK: } // CHECK: return [[RES]] : memref } - -// ----- - -func @test_reducemax(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { - %0 ="onnx.ReduceMax"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: test_reducemax - // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> - // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { - // CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32 - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> - - // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> - // CHECK: [[CMP:%.+]] = cmpf "ogt", [[LOAD2]], [[LOAD1]] : f32 - // CHECK: [[SELECT:%.+]] = select [[CMP]], [[LOAD2]], [[LOAD1]] : f32 - // CHECK: store [[SELECT]], [[RES]][%arg1, %arg3] : memref<3x2xf32> - // CHECK: } - // CHECK: return [[RES]] : memref<3x2xf32> -} - -// ----- - -func @test_reducemin(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { - %0 ="onnx.ReduceMin"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: test_reducemin - // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> - // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { - // CHECK: [[IDENTITY:%.+]] = constant 0x7F800000 : f32 - // CHECK: affine.store [[IDENTITY]], 
[[RES]][%arg1, %arg2] : memref<3x2xf32> - - // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> - // CHECK: [[CMP:%.+]] = cmpf "olt", [[LOAD2]], [[LOAD1]] : f32 - // CHECK: [[SELECT:%.+]] = select [[CMP]], [[LOAD2]], [[LOAD1]] : f32 - // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg3] : memref<3x2xf32> - // CHECK: } - // CHECK: return [[RES]] : memref<3x2xf32> -} - -// ----- - -func @test_reduceprod(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { - %0 ="onnx.ReduceProd"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: test_reduceprod - // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> - // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { - // CHECK: [[IDENTITY:%.+]] = constant 1.000000e+00 : f32 - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> - - // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> - // CHECK: [[REDUCE:%.+]] = mulf [[LOAD2]], [[LOAD1]] : f32 - // CHECK: affine.store [[REDUCE]], [[RES]][%arg1, %arg3] : memref<3x2xf32> - // CHECK: } - // CHECK: return [[RES]] : memref<3x2xf32> -} - -// ----- - -func @test_reducesum(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { - %0 ="onnx.ReduceSum"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: test_reducesum - // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> - // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { - // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> - - // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { - // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> - // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> - // CHECK: [[REDUCE:%.+]] = addf [[LOAD2]], [[LOAD1]] : f32 - // CHECK: affine.store [[REDUCE]], [[RES]][%arg1, %arg3] : memref<3x2xf32> - // CHECK: } - // CHECK: return [[RES]] : memref<3x2xf32> -} // ----- @@ -1107,10 +1013,10 @@ func @test_matmul5(%arg0 : tensor<5xf32>, %arg1 : tensor) -> tensor< "std.return"(%0) : (tensor<*xf32>) -> () // CHECK-LABEL: test_matmul5 - // CHECK: [[CONSTANT:%.+]] = constant 0.000000e+00 : f32 // CHECK: [[C0:%.+]] = constant 0 : index // CHECK: [[DIM_0:%.+]] = dim 
%arg1, [[C0]] : memref // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref + // CHECK: [[CONSTANT:%.+]] = constant 0.000000e+00 : f32 // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2 // CHECK: [[C0_0:%.+]] = constant 0 : index // CHECK: [[DIM_1:%.+]] = dim [[RES]], [[C0_0]] : memref @@ -1139,10 +1045,10 @@ func @test_matmul6(%arg0 : tensor, %arg1 : tensor<5xf32>) -> tensor< "std.return"(%0) : (tensor<*xf32>) -> () // CHECK-LABEL: test_matmul6 - // CHECK: [[CONSTANT:%.+]] = constant 0.000000e+00 : f32 // CHECK: [[C0:%.+]] = constant 0 : index // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref + // CHECK: [[CONSTANT:%.+]] = constant 0.000000e+00 : f32 // CHECK: [[LOOPS:%.+]]:2 = krnl.define_loops 2 // CHECK: [[C0_0:%.+]] = constant 0 : index // CHECK: [[DIM_1:%.+]] = dim [[RES]], [[C0_0]] : memref @@ -1515,506 +1421,3 @@ func @test_concat_1(%arg0 : tensor<5x5x1x32xf32>, %arg1 : tensor<5x5x3x32xf32>, // CHECK: return [[RES]] : memref<5x5x9x32xf32> } - -// ----- - -func @test_pool_general_computation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { - %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> ((s2 ceildiv s4) * s4 - s2, d0 * s3 - s2)> - // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0, d0 * s3 + (s1 - 1) * s4 - s2 + 1)> - // CHECK-DAG: #{{.*}} = affine_map<() -> (0)> - // CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0 - ((s2 ceildiv s4) * s4 - s2), -(d0 * s3 - s2) + s0, d0 * s3 + (s1 - 1) * s4 - s2 - ((s2 ceildiv s4) * s4 - s2) + 1, d0 * s3 + (s1 - 1) * s4 - s2 - (d0 * s3 - s2) + 1)> - - // CHECK-LABEL: @test_pool_general_computation - - // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> - // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 - - // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 - // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { - - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - - // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { - // CHECK: {{.*}} = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> - // CHECK: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } - - // CHECK: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } -} - -// ----- - -func @test_pool_unknown_dimensions(%arg0 : tensor<1x3x?x32xf32>) -> tensor<*xf32> { - %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x?x32xf32>) -> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-DAG: #[[AFFINE_MAP:.+]] = affine_map<(d0)[s0, s1, s2, s3] -> ((d0 + s1 - (s0 - 1) * s3 - 1) floordiv s2 + 1)> - // 
CHECK-LABEL: test_pool_unknown_dimensions - // CHECK: [[C0:%.+]] = constant 2 : index - // CHECK: [[DIM:%.+]] = dim %arg0, [[C0]] : memref<1x3x?x32xf32> - // CHECK: [[KERNEL:%.+]] = constant 2 : index - // CHECK: [[PAD:%.+]] = constant 0 : index - // CHECK: [[STRIDE:%.+]] = constant 1 : index - // CHECK: [[DILATION:%.+]] = constant 1 : index - // CHECK: [[AFFINE_APPLY:%.+]] = affine.apply #[[AFFINE_MAP]]([[DIM]]){{.*}}[[KERNEL]], [[PAD]], [[STRIDE]], [[DILATION]]{{.*}} - // CHECK: [[RES:%.+]] = alloc([[AFFINE_APPLY]]) : memref<1x3x?x31xf32> -} - -// ----- - -func @test_averagepool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { - %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: @test_averagepool_identity_value - // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> - // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> -} - -// ----- - -func @test_maxpool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { - %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: @test_maxpool_identity_value - // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> - // CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32 - // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> -} - -// ----- - -func @test_averagepool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { - %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: @test_averagepool_pooling_operation - // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> - - // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 - // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { - - // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { - - // CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> - // CHECK: [[OUTPUT_LOAD:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: [[SUM:%.+]] = addf [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 - // CHECK: affine.store [[SUM]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } - - // CHECK: [[NUMERATOR:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: [[AVERAGE:%.+]] = divf [[NUMERATOR]], {{.*}} : f32 - // CHECK: affine.store [[AVERAGE]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } -} - -// ----- - -func @test_maxpool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { - %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> - "std.return"(%0) : 
(tensor<*xf32>) -> () - - // CHECK-LABEL: @test_maxpool_pooling_operation - // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> - - // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 - // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { - - // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { - - // CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> - // CHECK: [[OUTPUT_LOAD:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: [[GREATER:%.+]] = cmpf "ogt", [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 - // CHECK: [[SELECT:%.+]] = select [[GREATER]], [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 - // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } - - // CHECK-NOT: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK-NOT: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> - // CHECK: } -} - -// ----- - -func @test_lstm_general_computation(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { - %cst = constant unit - %Y, %Y_h, %Y_c = "onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) - return %Y_h : tensor<*xf32> - - // CHECK-DAG: [[ACCESS_BY_OFFSET_MAP:#.+]] = affine_map<(d0)[s0, s1] -> (d0 + s0 * s1)> - // CHECK-LABEL: @test_lstm_general_computation - - // CHECK: [[CELL_STATE:%.+]] = alloc() : memref<1x3x3xf32> - // CHECK: [[HIDDEN_STATE:%.+]] = alloc() : memref<1x3x3xf32> - // CHECK: {{.*}} = constant unit - - // CHECK: [[INITIAL_VALUE:%.+]] = constant 0.000000e+00 : f32 - // CHECK: [[INITIALIZE_LOOPS:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[INITIALIZE_LOOPS]]#0, [[INITIALIZE_LOOPS]]#1, [[INITIALIZE_LOOPS]]#2) with ([[INITIALIZE_LOOPS]]#0 -> %arg3 = 0 to 1, [[INITIALIZE_LOOPS]]#1 -> %arg4 = 0 to 3, [[INITIALIZE_LOOPS]]#2 -> %arg5 = 0 to 3) { - // CHECK: affine.store [[INITIAL_VALUE]], [[HIDDEN_STATE]][%arg3, %arg4, %arg5] : memref<1x3x3xf32> - // CHECK: affine.store [[INITIAL_VALUE]], [[CELL_STATE]][%arg3, %arg4, %arg5] : memref<1x3x3xf32> - // CHECK: } - - // CHECK: [[SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 - // CHECK: krnl.iterate([[SEQUENCE_LOOPS]]) with ([[SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { - // CHECK: {{.*}} = constant 0 : index - // CHECK: {{.*}} = constant 3 : index - // CHECK: {{.*}} = constant 0 : index - // CHECK: {{.*}} = constant 1 : index - // CHECK: {{.*}} = constant 2 : index - // CHECK: {{.*}} = constant 3 : index - // CHECK: {{.*}} = constant 4 : index - // CHECK: {{.*}} = constant 5 : index - // CHECK: {{.*}} = constant 6 : index - // CHECK: {{.*}} = constant 7 : index - // CHECK: [[DATA_LOOPS:%.+]]:2 = krnl.define_loops 2 - // CHECK: krnl.iterate([[DATA_LOOPS]]#0, [[DATA_LOOPS]]#1) with ([[DATA_LOOPS]]#0 -> %arg4 = 0 to 3, [[DATA_LOOPS]]#1 -> %arg5 = 0 to 3) { - // CHECK: [[hCt:%.+]] = alloc() : memref - // 
CHECK: [[Ot:%.+]] = alloc() : memref - // CHECK: [[ct:%.+]] = alloc() : memref - // CHECK: [[Ft:%.+]] = alloc() : memref - // CHECK: [[It:%.+]] = alloc() : memref - // CHECK: [[Ht1_LOAD:%.+]] = affine.load [[HIDDEN_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> - // CHECK: [[Ct1_LOAD:%.+]] = affine.load [[CELL_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> - - // CHECK: [[ZERO_FLOAT:%.+]] = constant 0.000000e+00 : f32 - // CHECK: [[XtWi_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[XtWi_GEMM]][] : memref - // CHECK: [[Ht1Ri_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Ri_GEMM]][] : memref - // CHECK: [[XtWo_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[XtWo_GEMM]][] : memref - // CHECK: [[Ht1Ro_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Ro_GEMM]][] : memref - // CHECK: [[XtWf_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[XtWf_GEMM]][] : memref - // CHECK: [[Ht1Rf_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Rf_GEMM]][] : memref - // CHECK: [[XtWc_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[XtWc_GEMM]][] : memref - // CHECK: [[Ht1Rc_GEMM:%.+]] = alloc() : memref - // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Rc_GEMM]][] : memref - - // CHECK: [[REDUCTION_LOOPS:%.+]] = krnl.define_loops 1 - // CHECK: krnl.iterate([[REDUCTION_LOOPS]]) with ([[REDUCTION_LOOPS]] -> %arg6 = 0 to 2) { - // CHECK: [[INPUT_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c0_1, %c3] - // CHECK: [[OUTPUT_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c1, %c3] - // CHECK: [[FORGET_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c2, %c3] - // CHECK: [[CELL_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c3_2, %c3] - // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%arg3, %arg4, %arg6] : memref<4x3x2xf32> - - // CHECK: [[Wi_LOAD:%.+]] = affine.load %arg1[%c0, [[INPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> - // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wi_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[XtWi_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[XtWi_GEMM]][] : memref - - // CHECK: [[Ri_LOAD:%.+]] = affine.load %arg2[%c0, [[INPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> - // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Ri_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[Ht1Ri_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ht1Ri_GEMM]][] : memref - - // CHECK: [[Wo_LOAD:%.+]] = affine.load %arg1[%c0, [[OUTPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> - // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wo_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[XtWo_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[XtWo_GEMM]][] : memref - - // CHECK: [[Ro_LOAD:%.+]] = affine.load %arg2[%c0, [[OUTPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> - // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Ro_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[Ht1Ro_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ht1Ro_GEMM]][] : memref - - // CHECK: [[Wf_LOAD:%.+]] = affine.load %arg1[%c0, [[FORGET_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> - // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wf_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[XtWf_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[XtWf_GEMM]][] : 
memref - - // CHECK: [[Rf_LOAD:%.+]] = affine.load %arg2[%c0, [[FORGET_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> - // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Rf_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[Ht1Rf_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ht1Rf_GEMM]][] : memref - - // CHECK: [[Wc_LOAD:%.+]] = affine.load %arg1[%c0, [[CELL_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> - // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wc_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[XtWc_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[XtWc_GEMM]][] : memref - - // CHECK: [[Rc_LOAD:%.+]] = affine.load %arg2[%c0, [[CELL_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> - // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Rc_LOAD]] : f32 - // CHECK: {{.*}} = affine.load [[Ht1Rc_GEMM]][] : memref - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ht1Rc_GEMM]][] : memref - // CHECK: } - - // CHECK: [[XtWi_LOAD:%.+]] = affine.load [[XtWi_GEMM]][] : memref - // CHECK: [[Ht1Ri_LOAD:%.+]] = affine.load [[Ht1Ri_GEMM]][] : memref - // CHECK: [[It_OUTPUT:%.+]] = addf [[XtWi_LOAD]], [[Ht1Ri_LOAD]] : f32 - - // CHECK: [[SIGMOID_INPUT:%.+]] = alloc() : memref - // CHECK: affine.store [[It_OUTPUT]], [[SIGMOID_INPUT]][] : memref - // CHECK: {{.*}} = affine.load [[SIGMOID_INPUT]][] : memref - // CHECK: {{.*}} = constant 0.000000e+00 : f32 - // CHECK: {{.*}} = constant 1.000000e+00 : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[It]][] : memref - // CHECK: [[It_LOAD:%.+]] = affine.load [[It]][] : memref - - // CHECK: [[XtWf_LOAD:%.+]] = affine.load [[XtWf_GEMM]][] : memref - // CHECK: [[Ht1Rf_LOAD:%.+]] = affine.load [[Ht1Rf_GEMM]][] : memref - // CHECK: [[Ft_OUTPUT:%.+]] = addf [[XtWf_LOAD]], [[Ht1Rf_LOAD]] : f32 - - // CHECK: [[SIGMOID_FORGET:%.+]] = alloc() : memref - // CHECK: affine.store [[Ft_OUTPUT]], [[SIGMOID_FORGET]][] : memref - // CHECK: {{.*}} = affine.load [[SIGMOID_FORGET]][] : memref - // CHECK: {{.*}} = constant 0.000000e+00 : f32 - // CHECK: {{.*}} = constant 1.000000e+00 : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ft]][] : memref - // CHECK: [[Ft_LOAD:%.+]] = affine.load [[Ft]][] : memref - - // CHECK: [[XtWc_LOAD:%.+]] = affine.load [[XtWc_GEMM]][] : memref - // CHECK: [[Ht1Rc_LOAD:%.+]] = affine.load [[Ht1Rc_GEMM]][] : memref - // CHECK: [[ct_OUTPUT:%.+]] = addf [[XtWc_LOAD]], [[Ht1Rc_LOAD]] : f32 - - // CHECK: [[TANH_CELL:%.+]] = alloc() : memref - // CHECK: affine.store [[ct_OUTPUT]], [[TANH_CELL]][] : memref - // CHECK: {{.*}} = affine.load [[TANH_CELL]][] : memref - // CHECK: {{.*}} = constant 0.000000e+00 : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[ct]][] : memref - // CHECK: [[ct_LOAD:%.+]] = affine.load [[ct]][] : memref - - // CHECK: [[FtCt1:%.+]] = mulf [[Ft_LOAD]], [[Ct1_LOAD]] : f32 - // CHECK: [[Itct:%.+]] = mulf [[It_LOAD]], [[ct_LOAD]] : f32 - // CHECK: [[Ct:%.+]] = 
addf [[FtCt1]], [[Itct]] : f32 - // CHECK: affine.store [[Ct]], [[CELL_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> - - // CHECK: [[XtWo_LOAD:%.+]] = affine.load [[XtWo_GEMM]][] : memref - // CHECK: [[Ht1Ro_LOAD:%.+]] = affine.load [[Ht1Ro_GEMM]][] : memref - // CHECK: [[Ot_OUTPUT:%.+]] = addf [[XtWo_LOAD]], [[Ht1Ro_LOAD]] : f32 - - // CHECK: [[SIGMOID_OUTPUT:%.+]] = alloc() : memref - // CHECK: affine.store [[Ot_OUTPUT]], [[SIGMOID_OUTPUT]][] : memref - // CHECK: {{.*}} = affine.load [[SIGMOID_OUTPUT]][] : memref - // CHECK: {{.*}} = constant 0.000000e+00 : f32 - // CHECK: {{.*}} = constant 1.000000e+00 : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[Ot]][] : memref - // CHECK: [[Ot_LOAD:%.+]] = affine.load [[Ot]][] : memref - - // CHECK: [[TANH_HIDDEN:%.+]] = alloc() : memref - // CHECK: affine.store [[Ct]], [[TANH_HIDDEN]][] : memref - // CHECK: {{.*}} = affine.load [[TANH_HIDDEN]][] : memref - // CHECK: {{.*}} = constant 0.000000e+00 : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = exp {{.*}} : f32 - // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 - // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 - // CHECK: affine.store {{.*}}, [[hCt]][] : memref - // CHECK: [[hCt_LOAD:%.+]] = affine.load [[hCt]][] : memref - - // CHECK: [[Ht:%.+]] = mulf [[Ot_LOAD]], [[hCt_LOAD]] : f32 - // CHECK: affine.store [[Ht]], [[HIDDEN_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> - - // CHECK: dealloc [[XtWi_GEMM]] : memref - // CHECK: dealloc [[XtWo_GEMM]] : memref - // CHECK: dealloc [[XtWf_GEMM]] : memref - // CHECK: dealloc [[XtWc_GEMM]] : memref - // CHECK: dealloc [[Ht1Ri_GEMM]] : memref - // CHECK: dealloc [[Ht1Ro_GEMM]] : memref - // CHECK: dealloc [[Ht1Rf_GEMM]] : memref - // CHECK: dealloc [[Ht1Rc_GEMM]] : memref - // CHECK: dealloc [[It]] : memref - // CHECK: dealloc [[Ft]] : memref - // CHECK: dealloc [[ct]] : memref - // CHECK: dealloc [[Ot]] : memref - // CHECK: dealloc [[hCt]] : memref - // CHECK: } - // CHECK: } - // CHECK: dealloc [[CELL_STATE]] : memref<1x3x3xf32> - // CHECK: return [[HIDDEN_STATE]] : memref<1x3x3xf32> -} - -// ----- - -func @test_lstm_reverse_mode(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { - %cst = constant unit - %Y, %Y_h, %Y_c = "onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64, direction = "reverse"} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) - return %Y_h : tensor<*xf32> - - // CHECK: [[REVERSE_IV_MAP:#.+]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)> - // CHECK-LABEL: @test_lstm_reverse_mode - - // CHECK: [[REVERSE_SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 - // CHECK: krnl.iterate([[REVERSE_SEQUENCE_LOOPS]]) with ([[REVERSE_SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { - // CHECK: %[[SEQUENCE_LEN:.+]] = constant 4 : index - // CHECK: %[[REVERSE_SEQUENCE_IV:.+]] = affine.apply [[REVERSE_IV_MAP]](%arg3)[%[[SEQUENCE_LEN]]{{]}} - // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%[[REVERSE_SEQUENCE_IV]], {{.*}}, {{.*}}] : memref<4x3x2xf32> -} - -// ----- - -func @test_lstm_bidirectional_mode(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { - %cst = constant unit - %Y, %Y_h, %Y_c = 
"onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64, direction = "bidirectional"} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) - return %Y_h : tensor<*xf32> - - // CHECK: [[REVERSE_IV_MAP:#.+]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)> - // CHECK-LABEL: @test_lstm_bidirectional_mode - - // CHECK: [[SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 - // CHECK: krnl.iterate([[SEQUENCE_LOOPS]]) with ([[SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { - // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%arg3, {{.*}}, {{.*}}] : memref<4x3x2xf32> - - // CHECK: [[REVERSE_SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 - // CHECK: krnl.iterate([[REVERSE_SEQUENCE_LOOPS]]) with ([[REVERSE_SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { - // CHECK: %[[SEQUENCE_LEN:.+]] = constant 4 : index - // CHECK: %[[REVERSE_SEQUENCE_IV:.+]] = affine.apply [[REVERSE_IV_MAP]](%arg3)[%[[SEQUENCE_LEN]]{{]}} - // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%[[REVERSE_SEQUENCE_IV]], {{.*}}, {{.*}}] : memref<4x3x2xf32> -} - -// ----- - -func @test_squeeze(%arg0 : tensor<16x1x32x1x64xf32>) -> tensor<*xf32> { - %0 = "onnx.Squeeze"(%arg0) { axes = [1, -2]} : (tensor<16x1x32x1x64xf32>) -> (tensor<*xf32>) - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: @test_squeeze - // CHECK: [[RES:%.+]] = alloc() : memref<16x32x64xf32> - // CHECK: [[TENSOR_SIZE:%.+]] = constant 131072 : i64 - // CHECK: "krnl.memcpy"([[RES]], %arg0, [[TENSOR_SIZE]]) : (memref<16x32x64xf32>, memref<16x1x32x1x64xf32>, i64) -> () - // CHECK: return [[RES]] : memref<16x32x64xf32> -} - -// ----- - -func @test_squeeze_unknown_dimensions(%arg0 : tensor) -> tensor<*xf32> { - %0 = "onnx.Squeeze"(%arg0) { axes = [1,-2]} : (tensor) -> (tensor<*xf32>) - "std.return"(%0) : (tensor<*xf32>) -> () - - // CHECK-LABEL: @test_squeeze_unknown_dimensions - // CHECK: [[C0:%.+]] = constant 0 : index - // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref - // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref - // CHECK: [[TENSOR_SIZE_0:%.+]] = constant 8192 : i64 - // CHECK: [[DIM_0_i64:%.+]] = index_cast [[DIM_0]] : index to i64 - // CHECK: [[TENSOR_SIZE_1:%.+]] = muli [[TENSOR_SIZE_0]], [[DIM_0_i64]] : i64 - // CHECK: "krnl.memcpy"([[RES]], %arg0, [[TENSOR_SIZE_1]]) : (memref, memref, i64) -> () - // CHECK: return [[RES]] : memref -} - -// ----- - -func @test_split_equal(%arg0 : tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) { - %0, %1 = "onnx.Split"(%arg0) { axis = 0} : (tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) - "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () - - // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0) -> (d0 + 8)> - // CHECK-LABEL: @test_split_equal - - // CHECK: [[RES_1:%.+]] = alloc() : memref<8x32x64xf32> - // CHECK: [[RES_0:%.+]] = alloc() : memref<8x32x64xf32> - // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to 8, [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 32, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) { - // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<16x32x64xf32> - // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref<8x32x64xf32> - // CHECK: } - // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to 8, [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 32, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) 
{ - // CHECK: %[[INDEX:.+]] = affine.apply [[INDEX_MAP]](%arg1) - // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%[[INDEX]], %arg2, %arg3] : memref<16x32x64xf32> - // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref<8x32x64xf32> - // CHECK: } - // CHECK: return [[RES_0]], [[RES_1]] : memref<8x32x64xf32>, memref<8x32x64xf32> -} - -// ----- - -func @test_split_variable(%arg0 : tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) { - %0, %1 = "onnx.Split"(%arg0) { axis = 1, split = [2, 30]} : (tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) - "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () - - // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0) -> (d0 + 2)> - // CHECK-LABEL: @test_split_variable - - // CHECK: [[RES_1:%.+]] = alloc() : memref<16x30x64xf32> - // CHECK: [[RES_0:%.+]] = alloc() : memref<16x2x64xf32> - // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to 16, [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 2, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) { - // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<16x32x64xf32> - // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref<16x2x64xf32> - // CHECK: } - // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3 - // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to 16, [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 30, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) { - // CHECK: %[[INDEX:.+]] = affine.apply [[INDEX_MAP]](%arg2) - // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%arg1, %[[INDEX]], %arg3] : memref<16x32x64xf32> - // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref<16x30x64xf32> - // CHECK: } - // CHECK: return [[RES_0]], [[RES_1]] : memref<16x2x64xf32>, memref<16x30x64xf32> -} - -// ----- - -func @test_split_unknown_dimension(%arg0 : tensor) -> (tensor<*xf32>, tensor<*xf32>) { - %0, %1 = "onnx.Split"(%arg0) { axis = 1, split = [2, 30]} : (tensor) -> (tensor<*xf32>, tensor<*xf32>) - "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () - - // CHECK: [[INDEX_MAP:#.+]] = affine_map<(d0) -> (d0 + 2)> - // CHECK-LABEL: @test_split_unknown_dimension - - // CHECK: [[C0:%.+]] = constant 0 : index - // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref - // CHECK: [[RES_0:%.+]] = alloc([[DIM_0]]) : memref - // CHECK: [[C0_0:%.+]] = constant 0 : index - // CHECK: [[DIM_1:%.+]] = dim %arg0, [[C0_0]] : memref - // CHECK: [[RES_1:%.+]] = alloc([[DIM_1]]) : memref - // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 - // CHECK: [[C0_2:%.+]] = constant 0 : index - // CHECK: [[DIM_0:%.+]] = dim [[RES_0]], [[C0_2]] : memref - // CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to [[DIM_0]], [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 2, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) { - // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref - // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref - // CHECK: } - // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3 - // CHECK: [[C0_3:%.+]] = constant 0 : index - // CHECK: [[DIM_1:%.+]] = dim [[RES_1]], [[C0_3]] : memref - // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to [[DIM_1]], [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 30, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) { - // CHECK: %[[INDEX:.+]] = affine.apply 
[[INDEX_MAP]](%arg2) - // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%arg1, %[[INDEX]], %arg3] : memref - // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref - // CHECK: } - // CHECK: return [[RES_0]], [[RES_1]] : memref, memref -} diff --git a/test/mlir/onnx/onnx_lowering_lstm.mlir b/test/mlir/onnx/onnx_lowering_lstm.mlir new file mode 100644 index 0000000..92a90ed --- /dev/null +++ b/test/mlir/onnx/onnx_lowering_lstm.mlir @@ -0,0 +1,263 @@ +// RUN: onnx-mlir-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s + +func @test_lstm_general_computation(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { + %cst = constant unit + %Y, %Y_h, %Y_c = "onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) + return %Y_h : tensor<*xf32> + + // CHECK-DAG: [[ACCESS_BY_OFFSET_MAP:#.+]] = affine_map<(d0)[s0, s1] -> (d0 + s0 * s1)> + + // CHECK-LABEL: @test_lstm_general_computation + + // CHECK: [[CELL_STATE:%.+]] = alloc() : memref<1x3x3xf32> + // CHECK: [[HIDDEN_STATE:%.+]] = alloc() : memref<1x3x3xf32> + // CHECK: {{.*}} = constant unit + + // CHECK: [[INITIAL_VALUE:%.+]] = constant 0.000000e+00 : f32 + // CHECK: [[INITIALIZE_LOOPS:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[INITIALIZE_LOOPS]]#0, [[INITIALIZE_LOOPS]]#1, [[INITIALIZE_LOOPS]]#2) with ([[INITIALIZE_LOOPS]]#0 -> %arg3 = 0 to 1, [[INITIALIZE_LOOPS]]#1 -> %arg4 = 0 to 3, [[INITIALIZE_LOOPS]]#2 -> %arg5 = 0 to 3) { + // CHECK: affine.store [[INITIAL_VALUE]], [[HIDDEN_STATE]][%arg3, %arg4, %arg5] : memref<1x3x3xf32> + // CHECK: affine.store [[INITIAL_VALUE]], [[CELL_STATE]][%arg3, %arg4, %arg5] : memref<1x3x3xf32> + // CHECK: } + + // CHECK: [[SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 + // CHECK: krnl.iterate([[SEQUENCE_LOOPS]]) with ([[SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { + // CHECK: {{.*}} = constant 0 : index + // CHECK: {{.*}} = constant 3 : index + // CHECK: {{.*}} = constant 0 : index + // CHECK: {{.*}} = constant 1 : index + // CHECK: {{.*}} = constant 2 : index + // CHECK: {{.*}} = constant 3 : index + // CHECK: {{.*}} = constant 4 : index + // CHECK: {{.*}} = constant 5 : index + // CHECK: {{.*}} = constant 6 : index + // CHECK: {{.*}} = constant 7 : index + // CHECK: [[DATA_LOOPS:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[DATA_LOOPS]]#0, [[DATA_LOOPS]]#1) with ([[DATA_LOOPS]]#0 -> %arg4 = 0 to 3, [[DATA_LOOPS]]#1 -> %arg5 = 0 to 3) { + // CHECK: [[hCt:%.+]] = alloc() : memref + // CHECK: [[Ot:%.+]] = alloc() : memref + // CHECK: [[ct:%.+]] = alloc() : memref + // CHECK: [[Ft:%.+]] = alloc() : memref + // CHECK: [[It:%.+]] = alloc() : memref + // CHECK: [[Ht1_LOAD:%.+]] = affine.load [[HIDDEN_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> + // CHECK: [[Ct1_LOAD:%.+]] = affine.load [[CELL_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> + + // CHECK: [[ZERO_FLOAT:%.+]] = constant 0.000000e+00 : f32 + // CHECK: [[XtWi_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[XtWi_GEMM]][] : memref + // CHECK: [[Ht1Ri_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Ri_GEMM]][] : memref + // CHECK: [[XtWo_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[XtWo_GEMM]][] : memref + // CHECK: [[Ht1Ro_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Ro_GEMM]][] : memref + // CHECK: 
[[XtWf_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[XtWf_GEMM]][] : memref + // CHECK: [[Ht1Rf_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Rf_GEMM]][] : memref + // CHECK: [[XtWc_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[XtWc_GEMM]][] : memref + // CHECK: [[Ht1Rc_GEMM:%.+]] = alloc() : memref + // CHECK: affine.store [[ZERO_FLOAT]], [[Ht1Rc_GEMM]][] : memref + + // CHECK: [[REDUCTION_LOOPS:%.+]] = krnl.define_loops 1 + // CHECK: krnl.iterate([[REDUCTION_LOOPS]]) with ([[REDUCTION_LOOPS]] -> %arg6 = 0 to 2) { + // CHECK: [[INPUT_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c0_1, %c3] + // CHECK: [[OUTPUT_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c1, %c3] + // CHECK: [[FORGET_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c2, %c3] + // CHECK: [[CELL_HIDDEN_INDEX:%.+]] = affine.apply #{{.*}}(%arg5)[%c3_2, %c3] + // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%arg3, %arg4, %arg6] : memref<4x3x2xf32> + + // CHECK: [[Wi_LOAD:%.+]] = affine.load %arg1[%c0, [[INPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> + // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wi_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[XtWi_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[XtWi_GEMM]][] : memref + + // CHECK: [[Ri_LOAD:%.+]] = affine.load %arg2[%c0, [[INPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> + // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Ri_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[Ht1Ri_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ht1Ri_GEMM]][] : memref + + // CHECK: [[Wo_LOAD:%.+]] = affine.load %arg1[%c0, [[OUTPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> + // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wo_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[XtWo_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[XtWo_GEMM]][] : memref + + // CHECK: [[Ro_LOAD:%.+]] = affine.load %arg2[%c0, [[OUTPUT_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> + // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Ro_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[Ht1Ro_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ht1Ro_GEMM]][] : memref + + // CHECK: [[Wf_LOAD:%.+]] = affine.load %arg1[%c0, [[FORGET_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> + // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wf_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[XtWf_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[XtWf_GEMM]][] : memref + + // CHECK: [[Rf_LOAD:%.+]] = affine.load %arg2[%c0, [[FORGET_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> + // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Rf_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[Ht1Rf_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ht1Rf_GEMM]][] : memref + + // CHECK: [[Wc_LOAD:%.+]] = affine.load %arg1[%c0, [[CELL_HIDDEN_INDEX]], %arg6] : memref<1x12x2xf32> + // CHECK: {{.*}} = mulf [[Xt_LOAD]], [[Wc_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[XtWc_GEMM]][] : memref + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[XtWc_GEMM]][] : memref + + // CHECK: [[Rc_LOAD:%.+]] = affine.load %arg2[%c0, [[CELL_HIDDEN_INDEX]], %arg6] : memref<1x12x3xf32> + // CHECK: {{.*}} = mulf [[Ht1_LOAD]], [[Rc_LOAD]] : f32 + // CHECK: {{.*}} = affine.load [[Ht1Rc_GEMM]][] : memref + // CHECK: 
{{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ht1Rc_GEMM]][] : memref + // CHECK: } + + // CHECK: [[XtWi_LOAD:%.+]] = affine.load [[XtWi_GEMM]][] : memref + // CHECK: [[Ht1Ri_LOAD:%.+]] = affine.load [[Ht1Ri_GEMM]][] : memref + // CHECK: [[It_OUTPUT:%.+]] = addf [[XtWi_LOAD]], [[Ht1Ri_LOAD]] : f32 + + // CHECK: [[SIGMOID_INPUT:%.+]] = alloc() : memref + // CHECK: affine.store [[It_OUTPUT]], [[SIGMOID_INPUT]][] : memref + // CHECK: {{.*}} = affine.load [[SIGMOID_INPUT]][] : memref + // CHECK: {{.*}} = constant 0.000000e+00 : f32 + // CHECK: {{.*}} = constant 1.000000e+00 : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[It]][] : memref + // CHECK: [[It_LOAD:%.+]] = affine.load [[It]][] : memref + + // CHECK: [[XtWf_LOAD:%.+]] = affine.load [[XtWf_GEMM]][] : memref + // CHECK: [[Ht1Rf_LOAD:%.+]] = affine.load [[Ht1Rf_GEMM]][] : memref + // CHECK: [[Ft_OUTPUT:%.+]] = addf [[XtWf_LOAD]], [[Ht1Rf_LOAD]] : f32 + + // CHECK: [[SIGMOID_FORGET:%.+]] = alloc() : memref + // CHECK: affine.store [[Ft_OUTPUT]], [[SIGMOID_FORGET]][] : memref + // CHECK: {{.*}} = affine.load [[SIGMOID_FORGET]][] : memref + // CHECK: {{.*}} = constant 0.000000e+00 : f32 + // CHECK: {{.*}} = constant 1.000000e+00 : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ft]][] : memref + // CHECK: [[Ft_LOAD:%.+]] = affine.load [[Ft]][] : memref + + // CHECK: [[XtWc_LOAD:%.+]] = affine.load [[XtWc_GEMM]][] : memref + // CHECK: [[Ht1Rc_LOAD:%.+]] = affine.load [[Ht1Rc_GEMM]][] : memref + // CHECK: [[ct_OUTPUT:%.+]] = addf [[XtWc_LOAD]], [[Ht1Rc_LOAD]] : f32 + + // CHECK: [[TANH_CELL:%.+]] = alloc() : memref + // CHECK: affine.store [[ct_OUTPUT]], [[TANH_CELL]][] : memref + // CHECK: {{.*}} = affine.load [[TANH_CELL]][] : memref + // CHECK: {{.*}} = constant 0.000000e+00 : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[ct]][] : memref + // CHECK: [[ct_LOAD:%.+]] = affine.load [[ct]][] : memref + + // CHECK: [[FtCt1:%.+]] = mulf [[Ft_LOAD]], [[Ct1_LOAD]] : f32 + // CHECK: [[Itct:%.+]] = mulf [[It_LOAD]], [[ct_LOAD]] : f32 + // CHECK: [[Ct:%.+]] = addf [[FtCt1]], [[Itct]] : f32 + // CHECK: affine.store [[Ct]], [[CELL_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> + + // CHECK: [[XtWo_LOAD:%.+]] = affine.load [[XtWo_GEMM]][] : memref + // CHECK: [[Ht1Ro_LOAD:%.+]] = affine.load [[Ht1Ro_GEMM]][] : memref + // CHECK: [[Ot_OUTPUT:%.+]] = addf [[XtWo_LOAD]], [[Ht1Ro_LOAD]] : f32 + + // CHECK: [[SIGMOID_OUTPUT:%.+]] = alloc() : memref + // CHECK: affine.store [[Ot_OUTPUT]], [[SIGMOID_OUTPUT]][] : memref + // CHECK: {{.*}} = affine.load [[SIGMOID_OUTPUT]][] : memref + // CHECK: {{.*}} = constant 0.000000e+00 : f32 + // CHECK: {{.*}} = constant 1.000000e+00 : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}}: f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[Ot]][] : memref + // CHECK: [[Ot_LOAD:%.+]] = affine.load [[Ot]][] 
: memref + + // CHECK: [[TANH_HIDDEN:%.+]] = alloc() : memref + // CHECK: affine.store [[Ct]], [[TANH_HIDDEN]][] : memref + // CHECK: {{.*}} = affine.load [[TANH_HIDDEN]][] : memref + // CHECK: {{.*}} = constant 0.000000e+00 : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = exp {{.*}} : f32 + // CHECK: {{.*}} = subf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32 + // CHECK: {{.*}} = divf {{.*}}, {{.*}} : f32 + // CHECK: affine.store {{.*}}, [[hCt]][] : memref + // CHECK: [[hCt_LOAD:%.+]] = affine.load [[hCt]][] : memref + + // CHECK: [[Ht:%.+]] = mulf [[Ot_LOAD]], [[hCt_LOAD]] : f32 + // CHECK: affine.store [[Ht]], [[HIDDEN_STATE]][%c0, %arg4, %arg5] : memref<1x3x3xf32> + + // CHECK: dealloc [[XtWi_GEMM]] : memref + // CHECK: dealloc [[XtWo_GEMM]] : memref + // CHECK: dealloc [[XtWf_GEMM]] : memref + // CHECK: dealloc [[XtWc_GEMM]] : memref + // CHECK: dealloc [[Ht1Ri_GEMM]] : memref + // CHECK: dealloc [[Ht1Ro_GEMM]] : memref + // CHECK: dealloc [[Ht1Rf_GEMM]] : memref + // CHECK: dealloc [[Ht1Rc_GEMM]] : memref + // CHECK: dealloc [[It]] : memref + // CHECK: dealloc [[Ft]] : memref + // CHECK: dealloc [[ct]] : memref + // CHECK: dealloc [[Ot]] : memref + // CHECK: dealloc [[hCt]] : memref + // CHECK: } + // CHECK: } + // CHECK: dealloc [[CELL_STATE]] : memref<1x3x3xf32> + // CHECK: return [[HIDDEN_STATE]] : memref<1x3x3xf32> +} + +// ----- + +func @test_lstm_reverse_mode(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { + %cst = constant unit + %Y, %Y_h, %Y_c = "onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64, direction = "reverse"} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) + return %Y_h : tensor<*xf32> + + // CHECK-DAG: [[REVERSE_IV_MAP1:#.+]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)> + + // CHECK-LABEL: @test_lstm_reverse_mode + + // CHECK: [[REVERSE_SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 + // CHECK: krnl.iterate([[REVERSE_SEQUENCE_LOOPS]]) with ([[REVERSE_SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { + // CHECK: %[[SEQUENCE_LEN:.+]] = constant 4 : index + // CHECK: %[[REVERSE_SEQUENCE_IV:.+]] = affine.apply [[REVERSE_IV_MAP1]](%arg3)[%[[SEQUENCE_LEN]]{{]}} + // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%[[REVERSE_SEQUENCE_IV]], {{.*}}, {{.*}}] : memref<4x3x2xf32> +} + +// ----- + +func @test_lstm_bidirectional_mode(%arg0: tensor<4x3x2xf32>, %arg1: tensor<1x12x2xf32>, %arg2: tensor<1x12x3xf32>) -> tensor<*xf32> { + %cst = constant unit + %Y, %Y_h, %Y_c = "onnx.LSTM"(%arg0, %arg1, %arg2, %cst, %cst, %cst, %cst, %cst) {hidden_size = 3 : i64, direction = "bidirectional"} : (tensor<4x3x2xf32>, tensor<1x12x2xf32>, tensor<1x12x3xf32>, none, none, none, none, none) -> (none, tensor<*xf32>, none) + return %Y_h : tensor<*xf32> + + // CHECK-DAG: [[REVERSE_IV_MAP1:#.+]] = affine_map<(d0)[s0] -> (-d0 + s0 - 1)> + + // CHECK-LABEL: @test_lstm_bidirectional_mode + + // CHECK: [[SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 + // CHECK: krnl.iterate([[SEQUENCE_LOOPS]]) with ([[SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { + // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%arg3, {{.*}}, {{.*}}] : memref<4x3x2xf32> + + // CHECK: [[REVERSE_SEQUENCE_LOOPS:%.+]] = krnl.define_loops 1 + // CHECK: krnl.iterate([[REVERSE_SEQUENCE_LOOPS]]) with ([[REVERSE_SEQUENCE_LOOPS]] -> %arg3 = 0 to 4) { + // CHECK: %[[SEQUENCE_LEN:.+]] = constant 4 : index + // CHECK: 
%[[REVERSE_SEQUENCE_IV:.+]] = affine.apply [[REVERSE_IV_MAP1]](%arg3)[%[[SEQUENCE_LEN]]{{]}} + // CHECK: [[Xt_LOAD:%.+]] = affine.load %arg0[%[[REVERSE_SEQUENCE_IV]], {{.*}}, {{.*}}] : memref<4x3x2xf32> +} diff --git a/test/mlir/onnx/onnx_lowering_pooling.mlir b/test/mlir/onnx/onnx_lowering_pooling.mlir new file mode 100644 index 0000000..c4e4ae8 --- /dev/null +++ b/test/mlir/onnx/onnx_lowering_pooling.mlir @@ -0,0 +1,121 @@ +// RUN: onnx-mlir-opt --shape-inference --lower-frontend %s | FileCheck %s + +// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> ((s2 ceildiv s4) * s4 - s2, d0 * s3 - s2)> +// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0, d0 * s3 + (s1 - 1) * s4 - s2 + 1)> +// CHECK-DAG: #{{.*}} = affine_map<() -> (0)> +// CHECK-DAG: #{{.*}} = affine_map<(d0)[s0, s1, s2, s3, s4] -> (s0 - ((s2 ceildiv s4) * s4 - s2), -(d0 * s3 - s2) + s0, d0 * s3 + (s1 - 1) * s4 - s2 - ((s2 ceildiv s4) * s4 - s2) + 1, d0 * s3 + (s1 - 1) * s4 - s2 - (d0 * s3 - s2) + 1)> + +// CHECK-DAG: #[[AFFINE_MAP1:.+]] = affine_map<(d0)[s0, s1, s2, s3] -> ((d0 + s1 - (s0 - 1) * s3 - 1) floordiv s2 + 1)> + +func @test_pool_general_computation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { + %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: @test_pool_general_computation + + // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> + // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 + + // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 + // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { + + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + + // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { + // CHECK: {{.*}} = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> + // CHECK: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } + + // CHECK: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } +} + +func @test_pool_unknown_dimensions(%arg0 : tensor<1x3x?x32xf32>) -> tensor<*xf32> { + %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x?x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: test_pool_unknown_dimensions + // CHECK: [[C0:%.+]] = constant 2 : index + // CHECK: [[DIM:%.+]] = dim %arg0, [[C0]] : memref<1x3x?x32xf32> + // CHECK: [[KERNEL:%.+]] = constant 2 : index + // CHECK: [[PAD:%.+]] = constant 0 : index + // CHECK: [[STRIDE:%.+]] = constant 1 : index + // CHECK: [[DILATION:%.+]] = constant 1 : index + // CHECK: [[AFFINE_APPLY:%.+]] = affine.apply #[[AFFINE_MAP1]]([[DIM]]){{.*}}[[KERNEL]], [[PAD]], [[STRIDE]], [[DILATION]]{{.*}} + // CHECK: [[RES:%.+]] = alloc([[AFFINE_APPLY]]) : 
memref<1x3x?x31xf32> +} + +func @test_averagepool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { + %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: @test_averagepool_identity_value + // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> + // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> +} + +func @test_maxpool_identity_value(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { + %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: @test_maxpool_identity_value + // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> + // CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> +} + +func @test_averagepool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { + %0 = "onnx.AveragePool"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: @test_averagepool_pooling_operation + // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> + + // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 + // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { + + // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[POOL_LOOPS]]#0, [[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { + + // CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> + // CHECK: [[OUTPUT_LOAD:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: [[SUM:%.+]] = addf [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 + // CHECK: affine.store [[SUM]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } + + // CHECK: [[NUMERATOR:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: [[AVERAGE:%.+]] = divf [[NUMERATOR]], {{.*}} : f32 + // CHECK: affine.store [[AVERAGE]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } +} + +// ----- + +func @test_maxpool_pooling_operation(%arg0 : tensor<1x3x32x32xf32>) -> tensor<*xf32> { + %0 = "onnx.MaxPoolSingleOut"(%arg0) {auto_pad = "NOTSET", kernel_shape = [2, 2]} : (tensor<1x3x32x32xf32>) -> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: @test_maxpool_pooling_operation + // CHECK: [[RES:%.+]] = alloc() : memref<1x3x31x31xf32> + + // CHECK: [[OUTPUT_LOOPS:%.+]]:4 = krnl.define_loops 4 + // CHECK: krnl.iterate([[OUTPUT_LOOPS]]#0, [[OUTPUT_LOOPS]]#1, [[OUTPUT_LOOPS]]#2, [[OUTPUT_LOOPS]]#3) with ([[OUTPUT_LOOPS]]#0 -> %arg1 = 0 to 1, [[OUTPUT_LOOPS]]#1 -> %arg2 = 0 to 3, [[OUTPUT_LOOPS]]#2 -> %arg3 = 0 to 31, [[OUTPUT_LOOPS]]#3 -> %arg4 = 0 to 31) { + + // CHECK: [[POOL_LOOPS:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[POOL_LOOPS]]#0, 
[[POOL_LOOPS]]#1) with ([[POOL_LOOPS]]#0 -> %arg5 = 0 to min #{{.*}}(%arg3)[%c32, %c2, %c0, %c1, %c1_0], [[POOL_LOOPS]]#1 -> %arg6 = 0 to min #{{.*}}(%arg4)[%c32_1, %c2_2, %c0_3, %c1_4, %c1_5]) { + + // CHECK: [[INPUT_LOAD:%.+]] = load %arg0[%arg1, %arg2, {{.*}}, {{.*}}] : memref<1x3x32x32xf32> + // CHECK: [[OUTPUT_LOAD:%.+]] = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: [[GREATER:%.+]] = cmpf "ogt", [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 + // CHECK: [[SELECT:%.+]] = select [[GREATER]], [[OUTPUT_LOAD]], [[INPUT_LOAD]] : f32 + // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } + + // CHECK-NOT: {{.*}} = affine.load [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK-NOT: affine.store {{.*}}, [[RES]][%arg1, %arg2, %arg3, %arg4] : memref<1x3x31x31xf32> + // CHECK: } +} diff --git a/test/mlir/onnx/onnx_lowering_reductions.mlir b/test/mlir/onnx/onnx_lowering_reductions.mlir new file mode 100644 index 0000000..99d0a6c --- /dev/null +++ b/test/mlir/onnx/onnx_lowering_reductions.mlir @@ -0,0 +1,93 @@ +// RUN: onnx-mlir-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s + +func @test_reducemax(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { + %0 ="onnx.ReduceMax"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: test_reducemax + // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> + // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { + // CHECK: [[IDENTITY:%.+]] = constant 0xFF800000 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> + + // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> + // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> + // CHECK: [[CMP:%.+]] = cmpf "ogt", [[LOAD2]], [[LOAD1]] : f32 + // CHECK: [[SELECT:%.+]] = select [[CMP]], [[LOAD2]], [[LOAD1]] : f32 + // CHECK: store [[SELECT]], [[RES]][%arg1, %arg3] : memref<3x2xf32> + // CHECK: } + // CHECK: return [[RES]] : memref<3x2xf32> +} + +// ----- + +func @test_reducemin(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { + %0 ="onnx.ReduceMin"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: test_reducemin + // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> + // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { + // CHECK: [[IDENTITY:%.+]] = constant 0x7F800000 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> + + // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> + // CHECK: [[LOAD2:%.+]] = affine.load 
%0[%arg1, %arg3] : memref<3x2xf32> + // CHECK: [[CMP:%.+]] = cmpf "olt", [[LOAD2]], [[LOAD1]] : f32 + // CHECK: [[SELECT:%.+]] = select [[CMP]], [[LOAD2]], [[LOAD1]] : f32 + // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg3] : memref<3x2xf32> + // CHECK: } + // CHECK: return [[RES]] : memref<3x2xf32> +} + +// ----- + +func @test_reduceprod(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { + %0 ="onnx.ReduceProd"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: test_reduceprod + // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> + // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { + // CHECK: [[IDENTITY:%.+]] = constant 1.000000e+00 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> + + // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> + // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> + // CHECK: [[REDUCE:%.+]] = mulf [[LOAD2]], [[LOAD1]] : f32 + // CHECK: affine.store [[REDUCE]], [[RES]][%arg1, %arg3] : memref<3x2xf32> + // CHECK: } + // CHECK: return [[RES]] : memref<3x2xf32> +} + +// ----- + +func @test_reducesum(%arg0 : tensor<3x2x2xf32>) -> tensor<*xf32> { + %0 ="onnx.ReduceSum"(%arg0) {axes=[1], keepdims = 0 : i64} : (tensor<3x2x2xf32>)-> tensor<*xf32> + "std.return"(%0) : (tensor<*xf32>) -> () + + // CHECK-LABEL: test_reducesum + // CHECK: [[RES:%.+]] = alloc() : memref<3x2xf32> + // CHECK: [[DEF_LOOPS1:%.+]]:2 = krnl.define_loops 2 + // CHECK: krnl.iterate([[DEF_LOOPS1]]#0, [[DEF_LOOPS1]]#1) with ([[DEF_LOOPS1]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS1]]#1 -> %arg2 = 0 to 2) { + // CHECK: [[IDENTITY:%.+]] = constant 0.000000e+00 : f32 + // CHECK: affine.store [[IDENTITY]], [[RES]][%arg1, %arg2] : memref<3x2xf32> + + // CHECK: [[DEF_LOOPS2:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOPS2]]#0, [[DEF_LOOPS2]]#1, [[DEF_LOOPS2]]#2) with ([[DEF_LOOPS2]]#0 -> %arg1 = 0 to 3, [[DEF_LOOPS2]]#1 -> %arg2 = 0 to 2, [[DEF_LOOPS2]]#2 -> %arg3 = 0 to 2) { + // CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<3x2x2xf32> + // CHECK: [[LOAD2:%.+]] = affine.load %0[%arg1, %arg3] : memref<3x2xf32> + // CHECK: [[REDUCE:%.+]] = addf [[LOAD2]], [[LOAD1]] : f32 + // CHECK: affine.store [[REDUCE]], [[RES]][%arg1, %arg3] : memref<3x2xf32> + // CHECK: } + // CHECK: return [[RES]] : memref<3x2xf32> +} diff --git a/test/mlir/onnx/onnx_lowering_split.mlir b/test/mlir/onnx/onnx_lowering_split.mlir new file mode 100644 index 0000000..e393ac1 --- /dev/null +++ b/test/mlir/onnx/onnx_lowering_split.mlir @@ -0,0 +1,85 @@ +// RUN: onnx-mlir-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s + +func @test_split_equal(%arg0 : tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0, %1 = "onnx.Split"(%arg0) { axis = 0} : (tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) + "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () + + // CHECK: [[INDEX_MAP1:#.+]] = affine_map<(d0) -> (d0 + 8)> + + // CHECK-LABEL: @test_split_equal + + // CHECK: [[RES_1:%.+]] = alloc() : 
memref<8x32x64xf32> + // CHECK: [[RES_0:%.+]] = alloc() : memref<8x32x64xf32> + // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to 8, [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 32, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) { + // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<16x32x64xf32> + // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref<8x32x64xf32> + // CHECK: } + // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to 8, [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 32, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) { + // CHECK: %[[INDEX:.+]] = affine.apply [[INDEX_MAP1]](%arg1) + // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%[[INDEX]], %arg2, %arg3] : memref<16x32x64xf32> + // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref<8x32x64xf32> + // CHECK: } + // CHECK: return [[RES_0]], [[RES_1]] : memref<8x32x64xf32>, memref<8x32x64xf32> +} + +// ----- + +func @test_split_variable(%arg0 : tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0, %1 = "onnx.Split"(%arg0) { axis = 1, split = [2, 30]} : (tensor<16x32x64xf32>) -> (tensor<*xf32>, tensor<*xf32>) + "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () + + // CHECK: [[INDEX_MAP2:#.+]] = affine_map<(d0) -> (d0 + 2)> + + // CHECK-LABEL: @test_split_variable + + // CHECK: [[RES_1:%.+]] = alloc() : memref<16x30x64xf32> + // CHECK: [[RES_0:%.+]] = alloc() : memref<16x2x64xf32> + // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to 16, [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 2, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) { + // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref<16x32x64xf32> + // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref<16x2x64xf32> + // CHECK: } + // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3 + // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to 16, [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 30, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) { + // CHECK: %[[INDEX:.+]] = affine.apply [[INDEX_MAP2]](%arg2) + // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%arg1, %[[INDEX]], %arg3] : memref<16x32x64xf32> + // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref<16x30x64xf32> + // CHECK: } + // CHECK: return [[RES_0]], [[RES_1]] : memref<16x2x64xf32>, memref<16x30x64xf32> +} + +// ----- + +func @test_split_unknown_dimension(%arg0 : tensor) -> (tensor<*xf32>, tensor<*xf32>) { + %0, %1 = "onnx.Split"(%arg0) { axis = 1, split = [2, 30]} : (tensor) -> (tensor<*xf32>, tensor<*xf32>) + "std.return"(%0, %1) : (tensor<*xf32>, tensor<*xf32>) -> () + + // CHECK: [[INDEX_MAP3:#.+]] = affine_map<(d0) -> (d0 + 2)> + + // CHECK-LABEL: @test_split_unknown_dimension + + // CHECK: [[C0:%.+]] = constant 0 : index + // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref + // CHECK: [[RES_0:%.+]] = alloc([[DIM_0]]) : memref + // CHECK: [[C0_0:%.+]] = constant 0 : index + // CHECK: [[DIM_1:%.+]] = dim %arg0, [[C0_0]] : memref + // CHECK: [[RES_1:%.+]] = alloc([[DIM_1]]) : memref + // CHECK: [[DEF_LOOP_0:%.+]]:3 = krnl.define_loops 3 + // CHECK: [[C0_2:%.+]] = constant 0 : index + // CHECK: [[DIM_0:%.+]] = dim [[RES_0]], [[C0_2]] : memref + // 
CHECK: krnl.iterate([[DEF_LOOP_0]]#0, [[DEF_LOOP_0]]#1, [[DEF_LOOP_0]]#2) with ([[DEF_LOOP_0]]#0 -> %arg1 = 0 to [[DIM_0]], [[DEF_LOOP_0]]#1 -> %arg2 = 0 to 2, [[DEF_LOOP_0]]#2 -> %arg3 = 0 to 64) {
+  // CHECK: [[LOAD_0:%.+]] = affine.load %arg0[%arg1, %arg2, %arg3] : memref
+  // CHECK: affine.store [[LOAD_0]], [[RES_0]][%arg1, %arg2, %arg3] : memref
+  // CHECK: }
+  // CHECK: [[DEF_LOOP_1:%.+]]:3 = krnl.define_loops 3
+  // CHECK: [[C0_3:%.+]] = constant 0 : index
+  // CHECK: [[DIM_1:%.+]] = dim [[RES_1]], [[C0_3]] : memref
+  // CHECK: krnl.iterate([[DEF_LOOP_1]]#0, [[DEF_LOOP_1]]#1, [[DEF_LOOP_1]]#2) with ([[DEF_LOOP_1]]#0 -> %arg1 = 0 to [[DIM_1]], [[DEF_LOOP_1]]#1 -> %arg2 = 0 to 30, [[DEF_LOOP_1]]#2 -> %arg3 = 0 to 64) {
+  // CHECK: %[[INDEX:.+]] = affine.apply [[INDEX_MAP3]](%arg2)
+  // CHECK: [[LOAD_1:%.+]] = affine.load %arg0[%arg1, %[[INDEX]], %arg3] : memref
+  // CHECK: affine.store [[LOAD_1]], [[RES_1]][%arg1, %arg2, %arg3] : memref
+  // CHECK: }
+  // CHECK: return [[RES_0]], [[RES_1]] : memref, memref
+}
diff --git a/test/mlir/onnx/onnx_lowering_squeeze.mlir b/test/mlir/onnx/onnx_lowering_squeeze.mlir
new file mode 100644
index 0000000..d0f49e7
--- /dev/null
+++ b/test/mlir/onnx/onnx_lowering_squeeze.mlir
@@ -0,0 +1,29 @@
+// RUN: onnx-mlir-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s
+
+func @test_squeeze(%arg0 : tensor<16x1x32x1x64xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Squeeze"(%arg0) { axes = [1, -2]} : (tensor<16x1x32x1x64xf32>) -> (tensor<*xf32>)
+  "std.return"(%0) : (tensor<*xf32>) -> ()
+
+  // CHECK-LABEL: @test_squeeze
+  // CHECK: [[RES:%.+]] = alloc() : memref<16x32x64xf32>
+  // CHECK: [[TENSOR_SIZE:%.+]] = constant 131072 : i64
+  // CHECK: "krnl.memcpy"([[RES]], %arg0, [[TENSOR_SIZE]]) : (memref<16x32x64xf32>, memref<16x1x32x1x64xf32>, i64) -> ()
+  // CHECK: return [[RES]] : memref<16x32x64xf32>
+}
+
+// -----
+
+func @test_squeeze_unknown_dimensions(%arg0 : tensor) -> tensor<*xf32> {
+  %0 = "onnx.Squeeze"(%arg0) { axes = [1,-2]} : (tensor) -> (tensor<*xf32>)
+  "std.return"(%0) : (tensor<*xf32>) -> ()
+
+  // CHECK-LABEL: @test_squeeze_unknown_dimensions
+  // CHECK: [[C0:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
+  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[TENSOR_SIZE_0:%.+]] = constant 8192 : i64
+  // CHECK: [[DIM_0_i64:%.+]] = index_cast [[DIM_0]] : index to i64
+  // CHECK: [[TENSOR_SIZE_1:%.+]] = muli [[TENSOR_SIZE_0]], [[DIM_0_i64]] : i64
+  // CHECK: "krnl.memcpy"([[RES]], %arg0, [[TENSOR_SIZE_1]]) : (memref, memref, i64) -> ()
+  // CHECK: return [[RES]] : memref
+}
diff --git a/test/mlir/onnx/onnx_lowering_with_dealloc.mlir b/test/mlir/onnx/onnx_lowering_with_dealloc.mlir
index ccf653b..64e61a6 100644
--- a/test/mlir/onnx/onnx_lowering_with_dealloc.mlir
+++ b/test/mlir/onnx/onnx_lowering_with_dealloc.mlir
@@ -239,10 +239,15 @@ func @test_exp_exp(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_exp_exp
-  /// First Exp
+
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Exp
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -252,9 +257,6 @@ func @test_exp_exp(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[EXP]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Exp
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -278,10 +280,14 @@ func @test_tanh_tanh(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_tanh_tanh
-  /// First Tanh
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Tanh
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -297,9 +303,6 @@ func @test_tanh_tanh(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[TANH]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Tanh
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -329,10 +332,14 @@ func @test_sinh_sinh(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_sinh_sinh
-  /// First Sinh
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_0:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_0]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Sinh
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -348,9 +355,6 @@ func @test_sinh_sinh(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SINH_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Sinh
-  // CHECK: [[C0_0:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_0]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -380,10 +384,14 @@ func @test_cosh_cosh(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_cosh_cosh
-  /// First Cosh
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Cosh
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -399,9 +407,6 @@ func @test_cosh_cosh(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[COSH_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Cosh
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -430,10 +435,14 @@ func @test_sigmoid_sigmoid(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_sigmoid_sigmoid
-  /// First Sigmoid
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Sigmoid
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -448,9 +457,6 @@ func @test_sigmoid_sigmoid(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SIGMOID_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Sigmoid
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -479,10 +485,14 @@ func @test_relu_relu(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_relu_relu
-  /// First Relu
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Relu
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -494,9 +504,6 @@ func @test_relu_relu(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[RELU_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Relu
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -625,10 +632,14 @@ func @test_elu_elu(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_elu_elu
-  /// First Elu
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Elu
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -645,9 +656,6 @@ func @test_elu_elu(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Elu
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -678,10 +686,14 @@ func @test_leakyrelu_leakyrelu(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_leakyrelu_leakyrelu
-  /// First LeakyRelu
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First LeakyRelu
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -695,9 +707,6 @@ func @test_leakyrelu_leakyrelu(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SELECT]], [[RES]][%arg1, %arg2] : memref
 
   /// Second LeakyRelu
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -725,10 +734,14 @@ func @test_selu_selu(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_selu_selu
-  /// First Selu
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Selu
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -746,9 +759,6 @@ func @test_selu_selu(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SELU_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Selu
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -780,10 +790,14 @@ func @test_hardsigmoid_hardsigmoid(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_hardsigmoid_hardsigmoid
-  /// First HardSigmoid
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First HardSigmoid
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -802,9 +816,6 @@ func @test_hardsigmoid_hardsigmoid(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[SELECT2]], [[RES]][%arg1, %arg2] : memref
 
   /// Second HardSigmoid
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
@@ -837,10 +848,14 @@ func @test_reciprocal_reciprocal(%arg0 : tensor) -> tensor<*xf32> {
   "std.return"(%1) : (tensor<*xf32>) -> ()
 
   // CHECK-LABEL: test_reciprocal_reciprocal
-  /// First Reciprocal
   // CHECK: [[C0:%.+]] = constant 0 : index
   // CHECK: [[DIM_0:%.+]] = dim %arg0, [[C0]] : memref
   // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref
+  // CHECK: [[C0_1:%.+]] = constant 0 : index
+  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
+  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
+
+  /// First Reciprocal
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_0:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim %arg0, [[C0_0]] : memref
@@ -851,9 +866,6 @@ func @test_reciprocal_reciprocal(%arg0 : tensor) -> tensor<*xf32> {
   // CHECK: affine.store [[RECIPROCAL_RES]], [[RES]][%arg1, %arg2] : memref
 
   /// Second Reciprocal
-  // CHECK: [[C0_1:%.+]] = constant 0 : index
-  // CHECK: [[DIM_0:%.+]] = dim [[RES]], [[C0_1]] : memref
-  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref
   // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
   // CHECK: [[C0_2:%.+]] = constant 0 : index
   // CHECK: [[DIM_2:%.+]] = dim [[RES]], [[C0_2]] : memref
diff --git a/test/mlir/onnx/onnx_structure.mlir b/test/mlir/onnx/onnx_structure.mlir
index c295171..c8ce47c 100644
--- a/test/mlir/onnx/onnx_structure.mlir
+++ b/test/mlir/onnx/onnx_structure.mlir
@@ -1,4 +1,4 @@
-// RUN: onnx-mlir-opt %s -split-input-file | FileCheck %s
+// RUN: onnx-mlir-opt %s | FileCheck %s
 
 //===----------------------------------------------------------------------===//
 // CHECK-LABEL: @check_map1(%arg0: tuple) -> tensor<*xf32> {