[MLIR] Add broadcasting support for element wise operations (#398)
* Add broadcasting support for elementwise operations
* Remove MLIRDialect from MLIRWholeArchiveLibs
* Rewrite getLoopIVsForBroadcasting
* Compute dimensions for allocating result memory
* Compute dimensions for allocating result memory (revised)
* Use static dimension for element-wise operation testcases
* Add a test for addition with broadcasting
* Missed Traits.h when merging
* Revise
* Update SharedWork.md
* Broadcasting for variadic operations
* Edit comments
* Update SharedWork.md
* Reorganize the code
* Add CHECK-LABEL for test_add_with_broadcasting
parent 0a8af69e94
commit 06a968d4a1
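Note (illustrative, not part of the patch): the element-wise ops touched here follow ONNX/NumPy-style multidirectional broadcasting, which is what `getBroadcastedType` computes during shape inference. A minimal C++ sketch of that shape rule, with hypothetical names and a simplified treatment of dynamic (-1) dimensions:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative sketch of NumPy/ONNX-style multidirectional broadcasting:
// shapes are aligned at the rightmost dimension; a size of 1 (or a missing
// dimension) stretches to match the other operand. -1 stands for a dynamic
// dimension, mirroring MLIR's convention; its handling here is simplified.
std::vector<int64_t> broadcastShape(const std::vector<int64_t> &a,
                                    const std::vector<int64_t> &b) {
  size_t rank = std::max(a.size(), b.size());
  std::vector<int64_t> result(rank);
  for (size_t r = 0; r < rank; ++r) { // r counts from the right
    int64_t da = r < a.size() ? a[a.size() - 1 - r] : 1;
    int64_t db = r < b.size() ? b[b.size() - 1 - r] : 1;
    int64_t out = -1;
    if (da == db)
      out = da; // equal sizes (possibly both dynamic)
    else if (da == 1)
      out = db; // a broadcasts along this dimension
    else if (db == 1)
      out = da; // b broadcasts along this dimension
    else if (da == -1 || db == -1)
      out = -1; // conservatively dynamic in this sketch
    else
      assert(false && "incompatible shapes");
    result[rank - 1 - r] = out;
  }
  return result;
}

// Example: broadcastShape({10}, {-1, 10}) yields {-1, 10}, matching the
// test_add_with_broadcasting case (tensor<?xf32> + tensor<?x10xf32>).
```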
@@ -57,6 +57,7 @@ endfunction(find_mlir_lib)
find_mlir_lib(MLIRAffineOps)
find_mlir_lib(MLIRAffineToStandard)
find_mlir_lib(MLIRAnalysis)
+find_mlir_lib(MLIRDialect)
find_mlir_lib(MLIRExecutionEngine)
find_mlir_lib(MLIRIR)
find_mlir_lib(MLIRLLVMIR)
@@ -114,6 +115,7 @@ set(MLIRLibsOnce
        MLIRAffineOps
        MLIRAffineToStandard
        MLIRAnalysis
+        MLIRDialect
        MLIRExecutionEngine
        MLIRIR
        MLIRLLVMIR
@@ -8,10 +8,10 @@ ONNX operations for which some work is needed.

| ONNX Oper   | Person working on it  | ONNX 2 KRNL    | Basic functionality   | Extended functionality (e.g. broadcast)  |
| ----------  | --------------------- | -------------- | --------------------- | ---------------------------------------- |
-| Add         | Tung (updated)        | v              | v                     | noM                                      |
-| And         | Tung                  | v              | v                     | noM                                      |
-| Cosh        | Tung                  | v              | v                     | noM                                      |
-| Div         | Tung                  | v              | v                     |                                          |
+| Add         | Tung (updated)        | v              | v                     | M                                        |
+| And         | Tung                  | v              | v                     | M                                        |
+| Cosh        | Tung                  | v              | v                     |                                          |
+| Div         | Tung                  | v              | v                     | M                                        |
| Elu         | Tung                  | v              | v                     |                                          |
| Exp         | Tung                  | v              | v                     |                                          |
| FullGemm    |                       |                |                       | noU                                      |
@@ -19,18 +19,18 @@ ONNX operations for which some work is needed.
| HardSigmoid | Tung                  | v              | v                     |                                          |
| LeakyRelu   | Tung                  | v              | v                     |                                          |
| MatMul      |                       |                |                       | noM                                      |
-| Max         | Tung                  | v              | v                     | noM                                      |
-| Min         | Tung                  | v              | v                     | noM                                      |
-| Mul         | Tung                  | v              | v                     | noM                                      |
-| Or          | Tung                  | v              | v                     | noM                                      |
+| Max         | Tung                  | v              | v                     | M                                        |
+| Min         | Tung                  | v              | v                     | M                                        |
+| Mul         | Tung                  | v              | v                     | M                                        |
+| Or          | Tung                  | v              | v                     | M                                        |
| Relu        | Tung                  | v              | v                     |                                          |
| Selu        | Tung                  | v              | v                     |                                          |
| Sigmoid     | Tung                  | v              | v                     |                                          |
| Sinh        | Tung                  | v              | v                     |                                          |
-| Sub         | Tung                  | v              | v                     | noM                                      |
-| Sum         | Tung                  | v              | v                     | noM                                      |
+| Sub         | Tung                  | v              | v                     | M                                        |
+| Sum         | Tung                  | v              | v                     | M                                        |
| Tanh        | Tung                  | v              | v                     |                                          |
-| Xor         | Tung                  | v              | v                     | noM                                      |
+| Xor         | Tung                  | v              | v                     | M                                        |


ONNX operations for which the work is completed (full functionality) and tested
@@ -8,6 +8,7 @@
//
//===----------------------------------------------------------------------===//

+#include "mlir/Dialect/Traits.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
@@ -21,6 +22,7 @@
#include "onnx_ops.hpp"

using namespace mlir;
+using namespace mlir::OpTrait::util;

//===----------------------------------------------------------------------===//
// ONNXOpsDialect
@@ -127,7 +129,12 @@ void ONNXReciprocalOp::inferShapes() {
/// Infer the output shape of the ONNXAddOp. This method is required by the
/// shape inference interface.
void ONNXAddOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
@@ -135,7 +142,12 @@ void ONNXAddOp::inferShapes() {
/// Infer the output shape of the ONNXMulOp. This method is required by the
/// shape inference interface.
void ONNXMulOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
@@ -143,7 +155,12 @@ void ONNXMulOp::inferShapes() {
/// Infer the output shape of the ONNXDivOp. This method is required by the
/// shape inference interface.
void ONNXDivOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
@@ -151,7 +168,12 @@ void ONNXDivOp::inferShapes() {
/// Infer the output shape of the ONNXSubOp. This method is required by the
/// shape inference interface.
void ONNXSubOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
@@ -159,21 +181,38 @@ void ONNXSubOp::inferShapes() {
/// Infer the output shape of the ONNXAndOp. This method is required by the
/// shape inference interface.
void ONNXAndOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
// Or
/// Infer the output shape of the ONNXOrOp. This method is required by the
/// shape inference interface.
-void ONNXOrOp::inferShapes() { getResult()->setType(getOperand(0)->getType()); }
+void ONNXOrOp::inferShapes() {
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
+}

//===----------------------------------------------------------------------===//
// Xor
/// Infer the output shape of the ONNXXorOp. This method is required by the
/// shape inference interface.
void ONNXXorOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  if (!getOperand(0)->getType().isa<RankedTensorType>() ||
+      !getOperand(1)->getType().isa<RankedTensorType>())
+    return;
+  auto lhsTy = getOperand(0)->getType().cast<RankedTensorType>();
+  auto rhsTy = getOperand(1)->getType().cast<RankedTensorType>();
+  getResult()->setType(getBroadcastedType(lhsTy, rhsTy));
}

//===----------------------------------------------------------------------===//
@@ -183,7 +222,16 @@ void ONNXXorOp::inferShapes() {
/// Infer the output shape of the ONNXSumOp. This method is required by the
/// shape inference interface.
void ONNXSumOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  for (int i = 0; i < getNumOperands(); ++i) {
+    if (!getOperand(i)->getType().cast<RankedTensorType>())
+      return;
+  }
+  Type resultTy = getOperand(0)->getType().cast<RankedTensorType>();
+  for (int i = 1; i < getNumOperands(); ++i) {
+    Type nextTy = getOperand(i)->getType().cast<RankedTensorType>();
+    resultTy = getBroadcastedType(resultTy, nextTy);
+  }
+  getResult()->setType(resultTy);
}

//===----------------------------------------------------------------------===//
@@ -191,7 +239,16 @@ void ONNXSumOp::inferShapes() {
/// Infer the output shape of the ONNXMaxOp. This method is required by the
/// shape inference interface.
void ONNXMaxOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  for (int i = 0; i < getNumOperands(); ++i) {
+    if (!getOperand(i)->getType().cast<RankedTensorType>())
+      return;
+  }
+  Type resultTy = getOperand(0)->getType().cast<RankedTensorType>();
+  for (int i = 1; i < getNumOperands(); ++i) {
+    Type nextTy = getOperand(i)->getType().cast<RankedTensorType>();
+    resultTy = getBroadcastedType(resultTy, nextTy);
+  }
+  getResult()->setType(resultTy);
}

//===----------------------------------------------------------------------===//
@@ -199,7 +256,16 @@ void ONNXMaxOp::inferShapes() {
/// Infer the output shape of the ONNXMinOp. This method is required by the
/// shape inference interface.
void ONNXMinOp::inferShapes() {
-  getResult()->setType(getOperand(0)->getType());
+  for (int i = 0; i < getNumOperands(); ++i) {
+    if (!getOperand(i)->getType().cast<RankedTensorType>())
+      return;
+  }
+  Type resultTy = getOperand(0)->getType().cast<RankedTensorType>();
+  for (int i = 1; i < getNumOperands(); ++i) {
+    Type nextTy = getOperand(i)->getType().cast<RankedTensorType>();
+    resultTy = getBroadcastedType(resultTy, nextTy);
+  }
+  getResult()->setType(resultTy);
}

//===----------------------------------------------------------------------===//
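For the variadic ops above (Sum, Max, Min), the result type is obtained by folding the pairwise broadcast rule over all operands. A hedged compile-time sketch of that fold, reusing the hypothetical `broadcastShape` helper from the earlier note:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Pairwise rule sketched in the earlier note (hypothetical helper).
std::vector<int64_t> broadcastShape(const std::vector<int64_t> &a,
                                    const std::vector<int64_t> &b);

// Fold the pairwise rule over all operands, mirroring how the variadic
// ONNXSumOp/ONNXMaxOp/ONNXMinOp::inferShapes() fold getBroadcastedType.
std::vector<int64_t>
broadcastShapeVariadic(const std::vector<std::vector<int64_t>> &shapes) {
  std::vector<int64_t> result = shapes.front();
  for (size_t i = 1; i < shapes.size(); ++i)
    result = broadcastShape(result, shapes[i]);
  return result;
}

// Example: {{2, 1, 5}, {3, 5}, {1}} folds to {2, 3, 5}.
```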
@@ -9,6 +9,8 @@
//
//===----------------------------------------------------------------------===//

+#include <map>
+
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Sequence.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
@@ -44,16 +46,51 @@ static MemRefType convertTensorToMemRef(TensorType type) {
}

/// Insert an allocation and deallocation for the given MemRefType.
-static Value* insertAllocAndDealloc(MemRefType type, Location loc,
-    PatternRewriter& rewriter, bool insertDealloc, Value* oldMemRef = nullptr) {
+static Value *insertAllocAndDealloc(MemRefType type, Location loc,
+                                    PatternRewriter &rewriter,
+                                    bool insertDealloc,
+                                    ArrayRef<Value *> operands = {}) {
  // Put together alloc operands for any dynamic dimensions of the memref.
  AllocOp alloc;
-  if (oldMemRef) {
-    SmallVector<Value*, 4> allocOperands;
+  if (!operands.empty()) {
    auto memRefShape = type.getShape();
-    for (int i = 0; i < memRefShape.size(); ++i)
+    auto rank = memRefShape.size();
+
+    std::map<int, Value *> fromOperands;
+    for (int reversedIdx = 0; reversedIdx < rank; ++reversedIdx) {
+      int memRefDimIdx = rank - 1 - reversedIdx;
+      if (memRefShape[memRefDimIdx] < 0) { // unknown dimension
+        Value *maxDim = nullptr;
+        for (int i = 0; i < operands.size(); i++) {
+          auto operandShape =
+              operands[i]->getType().cast<MemRefType>().getShape();
+          int operandDimIdx = operandShape.size() - 1 - reversedIdx;
+
+          if (operandDimIdx < 0)
+            continue;
+
+          // In case of operations with broadcasting, the dimension of the
+          // alloc result is the maximum size along each dimension of the
+          // operands.
+          auto operandDim =
+              rewriter.create<DimOp>(loc, operands[i], operandDimIdx);
+          if (maxDim) {
+            auto maxCondition = rewriter.create<CmpIOp>(loc, CmpIPredicate::sgt,
+                                                        operandDim, maxDim);
+            maxDim = rewriter.create<SelectOp>(loc, maxCondition, operandDim,
+                                               maxDim);
+          } else {
+            maxDim = operandDim;
+          }
+        }
+        fromOperands.insert(std::make_pair(memRefDimIdx, maxDim));
+      }
+    }
+
+    SmallVector<Value *, 4> allocOperands;
+    for (int i = 0; i < rank; ++i)
      if (memRefShape[i] < 0)
-        allocOperands.push_back(rewriter.create<DimOp>(loc, oldMemRef, i));
+        allocOperands.push_back(fromOperands[i]);
    alloc = rewriter.create<AllocOp>(loc, type, allocOperands);
  } else {
    alloc = rewriter.create<AllocOp>(loc, type);
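The reworked `insertAllocAndDealloc` above sizes each unknown result dimension as the run-time maximum of the operands' sizes along that dimension, aligning dimensions from the right. A small, hypothetical compile-time analogue of that alignment and maximum, assuming statically known sizes (the patch itself emits DimOp, CmpIOp, and SelectOp to do this at run time):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// For one result dimension, counted from the right, take the largest
// corresponding operand size; operands of smaller rank simply do not
// participate. This is a static stand-in for the per-dimension
// DimOp / CmpIOp(sgt) / SelectOp chain the patch emits at run time.
int64_t maxDimFromRight(const std::vector<std::vector<int64_t>> &operandShapes,
                        int reversedIdx) {
  int64_t maxDim = 1; // sizes are at least 1
  for (const auto &shape : operandShapes) {
    int operandDimIdx = static_cast<int>(shape.size()) - 1 - reversedIdx;
    if (operandDimIdx < 0)
      continue; // this operand has no dimension at this position
    maxDim = std::max(maxDim, shape[operandDimIdx]);
  }
  return maxDim;
}
```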
@@ -109,6 +146,89 @@ unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
  return llvm::divideCeil(sizeInBits, 8);
}

+// Get run-time dimension information for unknown dimensions used for
+// broadcasting.
+std::map<int, std::map<int, Value *> >
+getBroadcastedDimInfo(Location loc, ConversionPatternRewriter &rewriter,
+                      MemRefType memRefType, ArrayRef<Value *> operands) {
+  auto memRefShape = memRefType.getShape();
+  int64_t rank = memRefShape.size();
+  // For unknown dimensions, we need to get dimension values at runtime in
+  // order to do broadcasting.
+  std::map<int, std::map<int, Value *>> DimInfo;
+  // For each result dimension, compute the number of sharing operands.
+  // Sharing operands are operands sharing the same index (counting from the
+  // rightmost to the leftmost) for a given dimension.
+  std::map<int, int> sharedDimCount;
+  for (int reversedIdx = 0; reversedIdx < rank; ++reversedIdx) {
+    int dimIdx = rank - 1 - reversedIdx;
+    sharedDimCount[dimIdx] = 0;
+    for (int i = 0; i < operands.size(); ++i) {
+      auto shape = operands[i]->getType().cast<MemRefType>().getShape();
+      if (reversedIdx <= shape.size() - 1)
+        sharedDimCount[dimIdx]++;
+    }
+  }
+  // An unknown dimension can have a value of 1 or N (N > 1).
+  // If its value is 1, it is broadcasted dimension.
+  // Otherwise, non-broadcasted dimension.
+  // We only care about unknown dimensions whose number of sharing operands is
+  // more than one, since they are potentially broadcasted dimensions.
+  for (int i = 0; i < operands.size(); ++i) {
+    std::map<int, Value *> broadcastedDims;
+    auto shape = operands[i]->getType().cast<MemRefType>().getShape();
+    int size = shape.size();
+    for (int j = 0; j < shape.size(); ++j) {
+      if (shape[j] < 0 and sharedDimCount[rank - size + j] > 1) {
+        auto dim = rewriter.create<DimOp>(loc, operands[i], j).getResult();
+        auto one = rewriter.create<ConstantIndexOp>(loc, 1);
+        auto isBroadcasted =
+            rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, dim, one);
+        broadcastedDims.insert(std::make_pair(j, isBroadcasted));
+      }
+    }
+    DimInfo.insert(std::make_pair(i, broadcastedDims));
+  }
+  return DimInfo;
+}
+
+// Extract induction variables that are used for broadcasting values of a
+// given operand.
+std::vector<Value *>
+getLoopIVsForBroadcasting(Location loc, ConversionPatternRewriter &rewriter,
+                           ArrayRef<Value *> loopIVs, Value *operand,
+                           std::map<int, Value *> broadcastedDims) {
+  // `operand` must has a ranked type. This should have been checked by the
+  // shape inference pass.
+  auto operandShape = operand->getType().cast<MemRefType>().getShape();
+  auto rank = operandShape.size();
+  auto loopCount = loopIVs.size();
+
+  std::vector<Value*> newLoopIVs;
+  for (unsigned reversedIdx = 0; reversedIdx < rank; ++reversedIdx) {
+    auto dimIdx = rank - 1 - reversedIdx;
+    auto loopIdx = loopCount - 1 - reversedIdx;
+    if (operandShape[dimIdx] == 1) {
+      // Broadcasted dimension
+      auto zero = rewriter.create<ConstantIndexOp>(loc, 0);
+      newLoopIVs.insert(newLoopIVs.begin(), zero);
+    } else if ((operandShape[dimIdx] == -1) &&
+               (broadcastedDims.find(dimIdx) != broadcastedDims.end())) {
+      // Unknown dimension, it can have a value of 1 or N (N > 1).
+      // If its value is 1, it is broadcasted dimension.
+      // Otherwise, non-broadcasted dimension.
+      auto zero = rewriter.create<ConstantIndexOp>(loc, 0);
+      auto idx = rewriter.create<SelectOp>(loc, broadcastedDims[dimIdx],
+                                           zero, loopIVs[loopIdx]);
+      newLoopIVs.insert(newLoopIVs.begin(), idx);
+    } else {
+      // Non-broadcasted dimension
+      newLoopIVs.insert(newLoopIVs.begin(), loopIVs[loopIdx]);
+    }
+  }
+  return newLoopIVs;
+}
+
namespace {

template <typename ElementwiseNaryOp>
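`getLoopIVsForBroadcasting` above rewrites the output loop indices into per-operand access indices: a static size-1 dimension always reads index 0, a dynamic dimension selects between 0 and the loop index at run time, and any other dimension uses the loop index directly. A hypothetical compile-time sketch of the same mapping, with the run-time SelectOp modeled as a boolean flag:

```cpp
#include <cstdint>
#include <map>
#include <vector>

// Map output loop indices to access indices for one operand.
// operandShape uses -1 for dynamic sizes; isOne[dim] says whether a dynamic
// dimension turned out to be 1 at run time (the lowered code decides this
// with CmpIOp/SelectOp instead of a flag).
std::vector<int64_t> mapLoopIVs(const std::vector<int64_t> &loopIVs,
                                const std::vector<int64_t> &operandShape,
                                const std::map<int, bool> &isOne) {
  int rank = static_cast<int>(operandShape.size());
  int loopCount = static_cast<int>(loopIVs.size());
  std::vector<int64_t> ivs(rank);
  for (int reversedIdx = 0; reversedIdx < rank; ++reversedIdx) {
    int dimIdx = rank - 1 - reversedIdx;
    int loopIdx = loopCount - 1 - reversedIdx;
    if (operandShape[dimIdx] == 1) {
      ivs[dimIdx] = 0; // statically broadcast: always read element 0
    } else if (operandShape[dimIdx] == -1 && isOne.count(dimIdx)) {
      // Dynamic size: broadcast only if it is 1 at run time.
      ivs[dimIdx] = isOne.at(dimIdx) ? 0 : loopIVs[loopIdx];
    } else {
      ivs[dimIdx] = loopIVs[loopIdx]; // not broadcast: use the loop index
    }
  }
  return ivs;
}
```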
@@ -505,7 +625,7 @@ struct ONNXElementwiseUnaryOpLowering : public ConversionPattern {
      alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc);
    else
      alloc = insertAllocAndDealloc(
-          memRefType, loc, rewriter, insertDealloc, operands[0]);
+          memRefType, loc, rewriter, insertDealloc, {operands[0]});

    // Number of loops
    auto memRefShape = memRefType.getShape();
@@ -595,20 +715,18 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
    // Insert an allocation and deallocation for the result of this operation.
    auto memRefType = convertTensorToMemRef(tensorType);

-    // If the output has a dynamic dimension, pass the operands required for
-    // each dynamic dimension to the AllocOp. The first operand of the
-    // operation is used. The operands of the op need to match in terms of
-    // dimensions with the result at this pre-optimization phase.
-    // TODO: verify that dimensions match.
-    // TODO: can the dimension of the result differ after optimizations?
    Value* alloc;
    bool insertDealloc = checkInsertDealloc(op);
-
+    // If the output has a dynamic dimension, we compute its dimension at
+    // runtime by using dimensions from the operands.
+    // In particular, we need to know from which operand a result dimension
+    // comes from.
+    // TODO: can the dimension of the result differ after optimizations?
    if (hasAllConstantDimensions(memRefType))
      alloc = insertAllocAndDealloc(memRefType, loc, rewriter, insertDealloc);
    else
      alloc = insertAllocAndDealloc(
-          memRefType, loc, rewriter, insertDealloc, operands[0]);
+          memRefType, loc, rewriter, insertDealloc, operands);

    // Number of loops
    auto memRefShape = memRefType.getShape();
@@ -639,13 +757,18 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
      if (memRefShape[i] < 0) {
        pack.pushConstantBound(0);
        pack.pushOperandBound(
-            rewriter.create<DimOp>(loc, operands[0], i).getResult());
+            rewriter.create<DimOp>(loc, alloc, i).getResult());
      } else {
        pack.pushConstantBound(0);
        pack.pushConstantBound(memRefShape[i]);
      }
    }

+    // Get run-time dimension information for unknown dimensions used for
+    // broadcasting.
+    std::map<int, std::map<int, Value *>> broadcastedDimInfo =
+        getBroadcastedDimInfo(loc, rewriter, memRefType, operands);
+
    auto iterateOp = rewriter.create<KrnlIterateOp>(loc, pack);
    Block& iterationBlock = iterateOp.bodyRegion().front();

@@ -655,8 +778,7 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
    // 1. Insert any optimizations in the KrnlOptimizeLoopsOp body.
    rewriter.setInsertionPointToEnd(&optimizationBlock);
    // Return from KrnlOptimizeLoopsOp body.
-    // When no optimizations are present we just return the loops
-    // unchaged.
+    // When no optimizations are present we just return the loops unchaged.
    rewriter.create<KrnlReturnLoopsOp>(loc, originalLoops);
    rewriter.setInsertionPoint(optimizedLoopsOp);

@@ -670,9 +792,13 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {

    // Fold over operands for each of their scalar values
    Value *accumulated, *next;
-    accumulated = rewriter.create<LoadOp>(loc, operands[0], loopIVs);
+    auto accumulatedLoopIVs = getLoopIVsForBroadcasting(
+        loc, rewriter, loopIVs, operands[0], broadcastedDimInfo[0]);
+    accumulated = rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
    for (unsigned i = 1; i < numArgs; i++) {
-      next = rewriter.create<LoadOp>(loc, operands[i], loopIVs);
+      auto nextLoopIVs = getLoopIVsForBroadcasting(
+          loc, rewriter, loopIVs, operands[i], broadcastedDimInfo[i]);
+      next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
      accumulated = mapToLowerScalarOp<ElementwiseVariadicOp>(
          op, memRefType.getElementType(), {accumulated, next}, rewriter);
    }
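Taken together, the loop nest generated by this pattern behaves like the following illustrative reference routine for a broadcasted binary Add over row-major buffers; names and the helper structure are assumptions for exposition, not code from the patch:

```cpp
#include <cstdint>
#include <vector>

// Illustrative reference semantics for a broadcasted binary Add over
// row-major buffers, mirroring the generated krnl loop nest: iterate over
// the output shape and, per operand, replace the loop index with 0 along
// broadcast (size-1 or missing) dimensions.
std::vector<float> referenceAdd(const std::vector<float> &a,
                                const std::vector<int64_t> &shapeA,
                                const std::vector<float> &b,
                                const std::vector<int64_t> &shapeB,
                                const std::vector<int64_t> &shapeOut) {
  auto flatten = [](const std::vector<int64_t> &idx,
                    const std::vector<int64_t> &shape) {
    int64_t off = 0;
    for (size_t i = 0; i < shape.size(); ++i)
      off = off * shape[i] + idx[i];
    return off;
  };
  auto project = [](const std::vector<int64_t> &outIdx,
                    const std::vector<int64_t> &shape) {
    // Align from the right; size-1 dimensions read index 0 (broadcast).
    std::vector<int64_t> idx(shape.size());
    for (size_t r = 0; r < shape.size(); ++r) {
      size_t outPos = outIdx.size() - 1 - r;
      size_t pos = shape.size() - 1 - r;
      idx[pos] = (shape[pos] == 1) ? 0 : outIdx[outPos];
    }
    return idx;
  };
  int64_t total = 1;
  for (int64_t d : shapeOut)
    total *= d;
  std::vector<float> out(total);
  std::vector<int64_t> outIdx(shapeOut.size(), 0);
  for (int64_t linear = 0; linear < total; ++linear) {
    // Decode the linear index into multi-dimensional output indices.
    int64_t rem = linear;
    for (int i = static_cast<int>(shapeOut.size()) - 1; i >= 0; --i) {
      outIdx[i] = rem % shapeOut[i];
      rem /= shapeOut[i];
    }
    out[linear] = a[flatten(project(outIdx, shapeA), shapeA)] +
                  b[flatten(project(outIdx, shapeB), shapeB)];
  }
  return out;
}

// Example: referenceAdd(a, {10}, b, {3, 10}, {3, 10}) adds a 1-D vector to
// every row of a 3x10 buffer, as in the test_add_with_broadcasting case.
```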
@@ -1,143 +1,129 @@
// RUN: onnf-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s

-func @test_add(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Add"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Add"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_add
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_mul(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Mul"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Mul"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_mul
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_div(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Div"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Div"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_div
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_sub(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Sub"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Sub"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_sub
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_and(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
-  %0 = "onnx.And"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
+func @test_and(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
+  %0 = "onnx.And"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%0) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_and
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i32
-  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: return [[RES]] : memref<?x10xi32>
+  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: return [[RES]] : memref<10x10xi32>
}

-func @test_or(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
-  %0 = "onnx.Or"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
+func @test_or(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
+  %0 = "onnx.Or"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%0) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_or
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i32
-  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: return [[RES]] : memref<?x10xi32>
+  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: return [[RES]] : memref<10x10xi32>
}

-func @test_xor(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
-  %0 = "onnx.Xor"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
+func @test_xor(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
+  %0 = "onnx.Xor"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%0) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_xor
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i32
-  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
-  // CHECK: return [[RES]] : memref<?x10xi32>
+  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi32>
+  // CHECK: return [[RES]] : memref<10x10xi32>
}

func @test_exp(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
@@ -310,66 +296,60 @@ func @test_reshape(%arg0 : tensor<?x10xf32>, %arg1 : tensor<4xi32>) -> tensor<*x
  // CHECK: return [[ALLOC]] : memref<?x?x?x?xf32>
}

-func @test_sum(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Sum"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Sum"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_sum
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_max(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Max"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Max"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_max
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

-func @test_min(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
-  %0 = "onnx.Min"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+func @test_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Min"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%0) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_min
-  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
-  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
-  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
-  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
-  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
-  // CHECK: return [[RES]] : memref<?x10xf32>
+  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
+  // CHECK: return [[RES]] : memref<10x10xf32>
}

func @test_elu(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
@@ -495,3 +475,29 @@ func @test_reciprocal(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
  // CHECK: store [[RECIPROCAL_RES]], [[RES]][%arg1, %arg2] : memref<?x10xf32>
  // CHECK: return [[RES]] : memref<?x10xf32>
}
+
+func @test_add_with_broadcasting(%arg0 : tensor<?xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.Add"(%arg0, %arg1) : (tensor<?xf32>, tensor<?x10xf32>) -> tensor<*xf32>
+  "std.return"(%0) : (tensor<*xf32>) -> ()
+
+  // CHECK-LABEL: test_add_with_broadcasting
+  // CHECK: [[DIM1:%.+]] = dim %arg1, 0 : memref<?x10xf32>
+  // CHECK: [[RES:%.+]] = alloc([[DIM1]]) : memref<?x10xf32>
+  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
+  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
+  // CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
+  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
+  // CHECK: [[DIM2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
+  // CHECK: [[DIM3:%.+]] = dim %arg0, 0 : memref<?xf32>
+  // CHECK: [[ONE:%.+]] = constant 1 : index
+  // CHECK: [[IS_ONE:%.+]] = cmpi "eq", [[DIM3]], [[ONE]] : index
+  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
+  // CHECK: [[ZERO:%.+]] = constant 0 : index
+  // CHECK: %[[SELECT1:.+]] = select [[IS_ONE]], [[ZERO]], %arg3 : index
+  // CHECK: [[LOAD1:%.+]] = load %arg0[%[[SELECT1]]] : memref<?xf32>
+  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
+  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
+  // CHECK: }
+  // CHECK: return [[RES]] : memref<?x10xf32>
+}
			@ -1,291 +1,263 @@
 | 
			
		|||
// RUN: onnf-opt --shape-inference --lower-frontend %s -split-input-file | FileCheck %s
 | 
			
		||||
 | 
			
		||||
func @test_add_add(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Add"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Add"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Add"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Add"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_add_add
  /// First Add
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Add
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}


func @test_mul_mul(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Mul"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Mul"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Mul"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Mul"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_mul_mul
  /// First Mul
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Mul
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

func @test_div_div(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Div"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Div"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Div"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Div"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_div_div
  /// First Div
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Div
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

func @test_sub_sub(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Sub"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Sub"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Sub"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Sub"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_sub_sub
  /// First Sub
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Sub
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

func @test_and_and(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.And"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
  %1 = "onnx.And"(%0, %arg1) : (tensor<*xi32>, tensor<?x10xi32>) -> tensor<*xi32>
func @test_and_and(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.And"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  %1 = "onnx.And"(%0, %arg1) : (tensor<*xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%1) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_and_and
  /// First And
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Second And
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xi32>
  // CHECK: dealloc [[RES]] : memref<10x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xi32>

  // CHECK: return [[RET_RES]] : memref<?x10xi32>
  // CHECK: return [[RET_RES]] : memref<10x10xi32>
}

func @test_or_or(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.Or"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
  %1 = "onnx.Or"(%0, %arg1) : (tensor<*xi32>, tensor<?x10xi32>) -> tensor<*xi32>
func @test_or_or(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.Or"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  %1 = "onnx.Or"(%0, %arg1) : (tensor<*xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%1) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_or_or
  /// First Or
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Second Or
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xi32>
  // CHECK: dealloc [[RES]] : memref<10x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xi32>

  // CHECK: return [[RET_RES]] : memref<?x10xi32>
  // CHECK: return [[RET_RES]] : memref<10x10xi32>
}

func @test_xor_xor(%arg0 : tensor<?x10xi32>, %arg1 : tensor<?x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.Xor"(%arg0, %arg1) : (tensor<?x10xi32>, tensor<?x10xi32>) -> tensor<*xi32>
  %1 = "onnx.Xor"(%0, %arg1) : (tensor<*xi32>, tensor<?x10xi32>) -> tensor<*xi32>
func @test_xor_xor(%arg0 : tensor<10x10xi32>, %arg1 : tensor<10x10xi32>) -> tensor<*xi32> {
  %0 = "onnx.Xor"(%arg0, %arg1) : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  %1 = "onnx.Xor"(%0, %arg1) : (tensor<*xi32>, tensor<10x10xi32>) -> tensor<*xi32>
  "std.return"(%1) : (tensor<*xi32>) -> ()

  // CHECK-LABEL: test_xor_xor
  /// First Xor
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Second Xor
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xi32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xi32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi32>
  // CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i32
  // CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<?x10xi32>
  // CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xi32>
  // CHECK: dealloc [[RES]] : memref<10x10xi32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xi32>

  // CHECK: return [[RET_RES]] : memref<?x10xi32>
  // CHECK: return [[RET_RES]] : memref<10x10xi32>
}

func @test_exp_exp(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
@ -572,131 +544,119 @@ func @test_relu_relu(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
  // CHECK: return [[RET_RES]] : memref<?x10xf32>
}

func @test_sum_sum(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Sum"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Sum"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Sum"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Sum"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_sum_sum
  /// First Sum
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Sum
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

func @test_max_max(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Max"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Max"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Max"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Max"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_max_max
  /// First Max
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Max
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

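Aside, not part of the diff: the Max lowering checked above builds the maximum from a compare followed by a select, and the Min test that follows is identical apart from the "olt" predicate. A minimal scalar sketch of the same compare-then-select shape:

def elementwise_max(a, b):
    # Mirrors cmpf "ogt" + select in the CHECK lines.
    return a if a > b else b

def elementwise_min(a, b):
    # Same lowering shape with the "olt" predicate.
    return a if a < b else b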
func @test_min_min(%arg0 : tensor<?x10xf32>, %arg1 : tensor<?x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Min"(%arg0, %arg1) : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Min"(%0, %arg1) : (tensor<*xf32>, tensor<?x10xf32>) -> tensor<*xf32>
func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tensor<*xf32> {
  %0 = "onnx.Min"(%arg0, %arg1) : (tensor<10x10xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  %1 = "onnx.Min"(%0, %arg1) : (tensor<*xf32>, tensor<10x10xf32>) -> tensor<*xf32>
  "std.return"(%1) : (tensor<*xf32>) -> ()

  // CHECK-LABEL: test_min_min
  /// First Min
  // CHECK: [[DIM_0:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: [[RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[RES:%.+]] = alloc() : memref<10x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim %arg0, 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Second Min
  // CHECK: [[DIM_0:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: [[RET_RES:%.+]] = alloc([[DIM_0]]) : memref<?x10xf32>
  // CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
  // CHECK: [[OPT_LOOPS:%.+]]:2 = krnl.optimize_loops  {
  // CHECK:   krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
  // CHECK: } : () -> (!krnl.loop, !krnl.loop)
  // CHECK: [[DIM_2:%.+]] = dim [[RES]], 0 : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to [[DIM_2]], [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<?x10xf32>
  // CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
  // CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
  // CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
  // CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<?x10xf32>
  // CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>

  /// Dealloc of first result.
  // CHECK: dealloc [[RES]] : memref<?x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<?x10xf32>
  // CHECK: dealloc [[RES]] : memref<10x10xf32>
  // CHECK-NOT: dealloc [[RET_RES]] : memref<10x10xf32>

  // CHECK: return [[RET_RES]] : memref<?x10xf32>
  // CHECK: return [[RET_RES]] : memref<10x10xf32>
}

func @test_elu_elu(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {